In [1]:
import nltk                # import the NLTK (Natural Language Toolkit) library
nltk.download('stopwords')              # Download stopwords list 
nltk.download('punkt')       #used for tokenizing text into sentences. 
import os
import re          # This is supportive of working with regular expressions
import csv
import string
import pandas as pd
from nltk.corpus import stopwords  #this is used to import the stopwords corpus from the NLTK library. 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences    
from sklearn.preprocessing import LabelEncoder 
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alaa_Abdallah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alaa_Abdallah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
#Text preprocessing 
def preprocess_text(text):
    # Convert to lower case
    text = text.lower()

    # Removal of punctuations
    text = ''.join([char for char in text if char not in string.punctuation])

    # Removal of stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    text = ' '.join([word for word in words if word not in stop_words])

    # Remove numbers
    text = ''.join([char for char in text if not char.isdigit()])

    # Remove tabs and newline characters
    text = text.replace('\n', '').replace('\t', '')

    return text

In [5]:
# Word Tokenization
def tokenize_text(text):
    # Tokenize the text into uni-gram tokens(split the text)
    tokens = word_tokenize(text)
    return tokens

In [6]:
# Token Normalization 
def normalize_tokens(tokens):
    porter = PorterStemmer()       # Initialize Porter stemmer
    stemmed_tokens = [token for token in tokens] # Stem each token
    return stemmed_tokens

In [7]:
def extract_vocabulary(text_series):
    vocabulary_set = set()
    for text in text_series:
        # Preprocess text
        cleaned_text = preprocess_text(text)
        # Tokenize text
        tokens = tokenize_text(cleaned_text)
        # Normalize tokens
        stemmed_tokens = normalize_tokens(tokens)
        # Add tokens to vocabulary set
        vocabulary_set.update(stemmed_tokens)
    return vocabulary_set

In [8]:
def read_data_from_files(directory, output_csv):
    data = []
    labels = []
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['text', 'label'])       # Write the header row
        for class_name in os.listdir(directory):
            class_path = os.path.join(directory, class_name)
            if os.path.isdir(class_path):
                for filename in os.listdir(class_path):
                    file_path = os.path.join(class_path, filename)
                    with open(file_path, 'r', encoding='utf-8') as file:
                        text = file.read()
                        cleaned_text = preprocess_text(text)          # Preprocess text
                        tokens = tokenize_text(cleaned_text)          # Tokenize text
                        stemmed_tokens = normalize_tokens(tokens)     # Normalize tokens
                        writer.writerow([' '.join(stemmed_tokens), class_name])
                        # Collect data and labels for likelihood estimation
                        data.append(' '.join(stemmed_tokens))
                        labels.append(class_name)

    return data, labels

In [70]:
# learning phase:

In [10]:
def estimate_prior(labels):
    prior_distribution = {}
    total_samples = len(labels)
    for label in set(labels):
        prior_distribution[label] = labels.count(label) / total_samples
    return prior_distribution

In [11]:
def estimate_likelihood(data, labels):
    likelihood = {}
    total_samples = len(labels)
    vocabulary = set(word for sample in data for word in sample.split())
    for label in set(labels):
        likelihood[label] = {}
        class_data = [data[i] for i in range(total_samples) if labels[i] == label]
        word_counts = {}
        for sample in class_data:
            for word in sample.split():
                word_counts[word] = word_counts.get(word, 0) + 1
        for word in vocabulary:
            likelihood[label][word] = (word_counts.get(word, 0) + 1) / (len(class_data) + len(vocabulary))
    return likelihood

In [12]:
def write_prior_to_csv(prior_distribution, output_csv):
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Class', 'Prior Probability'])
        for label, prior_prob in prior_distribution.items():
            writer.writerow([label, prior_prob])

In [13]:
def write_likelihood_to_csv(likelihood, output_csv):
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Class', 'Word', 'Probability'])
        for label, word_prob_dict in likelihood.items():
            for word, prob in word_prob_dict.items():
                writer.writerow([label, word, prob])       

In [14]:
directory = 'training'  # folder containing my data

In [15]:
output_csv_data = 'data.csv'              # CSV file to write the data 

output_csv_prior = 'prior_distribution.csv'          # CSV file to write the prior distribution 
output_csv_likelihood = 'likelihood.csv'   # CSV file to write the likelihood 

# Read data 
data, labels = read_data_from_files(directory, output_csv_data)

In [16]:
#_code
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,text,label
0,computer terminal systems cpml completes sale ...,acq
1,ohio mattress omt may lower st qtr net clevela...,acq
2,chemlawn chem rises hopes higher bids author c...,acq
3,cofab inc buys gulfex undisclosed amount houst...,acq
4,investment firms cut cyclops cyl stake washing...,acq


In [17]:
train, test = train_test_split(df, test_size=0.2)

In [18]:
train.head()

Unnamed: 0,text,label
6079,ec rejects wheat export bids grants barley bru...,grain
2909,bdm international bdm increases qtrly divs mcl...,earn
3672,intelligent systems mlp inp rd qtr dec net nor...,earn
7662,bank japan intervenes soon tokyo opening tokyo...,money-fx
11242,usda rejects sri lankas us dlr wheat price col...,wheat


In [19]:
test.head()

Unnamed: 0,text,label
2945,japans ntt forecasts profits fall tokyo march ...,earn
5674,jack winter inc jwi th qtr jan loss milwaukee ...,earn
5240,honeywell bull sees revenue growth new york ma...,earn
3542,iomega iomg sets management labor layoffs roy ...,earn
8567,usda comments export sales washington march co...,soybean


In [20]:
print(df.shape)
print(train.shape)
print(test.shape)

(11393, 2)
(9114, 2)
(2279, 2)


data =train['text']
labels =train['label']

-------
actual_labels = test['label']


In [21]:
# Extract vocabulary from the training dataset
vocabulary_set_train = extract_vocabulary(train['text'])

In [22]:
print("Size of Vocabulary Set:", len(vocabulary_set_train))

Size of Vocabulary Set: 29072


In [23]:
# Estimate prior distribution on the training dataset
train_labels = train['label'].tolist()
prior_distribution_train = estimate_prior(train_labels)

# Write prior distribution on the training dataset to CSV
write_prior_to_csv(prior_distribution_train, output_csv_prior)

In [24]:
train['label']

6079            grain
2909             earn
3672             earn
7662         money-fx
11242           wheat
             ...     
7707     money-supply
10685         unknown
702               acq
5727             earn
6376            grain
Name: label, Length: 9114, dtype: object

In [25]:
test['label']

2945         earn
5674         earn
5240         earn
3542         earn
8567      soybean
           ...   
136           acq
9919      unknown
4846         earn
6559     interest
11082     veg-oil
Name: label, Length: 2279, dtype: object

In [26]:
test.reset_index(drop=True, inplace=True)

In [27]:
print(train.columns)
print(train.head())

Index(['text', 'label'], dtype='object')
                                                    text     label
6079   ec rejects wheat export bids grants barley bru...     grain
2909   bdm international bdm increases qtrly divs mcl...      earn
3672   intelligent systems mlp inp rd qtr dec net nor...      earn
7662   bank japan intervenes soon tokyo opening tokyo...  money-fx
11242  usda rejects sri lankas us dlr wheat price col...     wheat


In [28]:
# Reset indices of the DataFrame
train.reset_index(drop=True, inplace=True)

# Estimate likelihood on the training dataset
likelihood_train = estimate_likelihood(train['text'], train['label'])

# Write likelihood to CSV
write_likelihood_to_csv(likelihood_train, output_csv_likelihood)

In [29]:
print(train.columns)
print(train.head())

Index(['text', 'label'], dtype='object')
                                                text     label
0  ec rejects wheat export bids grants barley bru...     grain
1  bdm international bdm increases qtrly divs mcl...      earn
2  intelligent systems mlp inp rd qtr dec net nor...      earn
3  bank japan intervenes soon tokyo opening tokyo...  money-fx
4  usda rejects sri lankas us dlr wheat price col...     wheat


In [30]:
# inference 'prediction phase'

In [31]:
def predict_sentiment(text, prior_distribution, likelihood):
    # Preprocess the text
    cleaned_text = preprocess_text(text)
    # Tokenize the text
    tokens = tokenize_text(cleaned_text)
    # Normalize the tokens
    stemmed_tokens = normalize_tokens(tokens)
    
    # Initialize probabilities for each class
    class_probabilities = {}
    
    # Calculate the probability of each class
    for label, prior_prob in prior_distribution.items():
        # Initialize probability with prior
        class_probability = prior_prob
        for word in stemmed_tokens:
            # Update probability using likelihood
            if word in likelihood[label]:
                class_probability *= likelihood[label][word]
        class_probabilities[label] = class_probability
    
    # Determine the predicted sentiment (class) with the highest probability
    predicted_sentiment = max(class_probabilities, key=class_probabilities.get)
    
    return predicted_sentiment

In [32]:
# empty list to store predictions
predictions = []

# Iterate over each text in the test dataset
for index, row in test.iterrows():
    text = row['text']
    # Predict sentiment for each text 
    predicted_sentiment = predict_sentiment(text, prior_distribution_train, likelihood_train)
    predictions.append(predicted_sentiment)

# Add predictions to the test DataFrame
test['predicted_sentiment'] = predictions

In [33]:
# Print the test DataFrame with predictions
print(test.head())

                                                text    label  \
0  japans ntt forecasts profits fall tokyo march ...     earn   
1  jack winter inc jwi th qtr jan loss milwaukee ...     earn   
2  honeywell bull sees revenue growth new york ma...     earn   
3  iomega iomg sets management labor layoffs roy ...     earn   
4  usda comments export sales washington march co...  soybean   

  predicted_sentiment  
0                earn  
1                earn  
2                 tea  
3                 tea  
4                 tea  


In [34]:
# write the test DataFrame with predictions to a CSV file
test.to_csv('test_with_predictions.csv', index=False)

In [35]:
# Sort the index of the DataFrame
test.sort_index(inplace=True)

In [36]:
# empty list to store predictions
predictions = []

# Iterate over each text in the test dataset
for index, row in test.iterrows():
    text = row['text']
    # Predict sentiment for each text using the predict_sentiment function
    predicted_sentiment = predict_sentiment(text, prior_distribution_train, likelihood_train)
    predictions.append(predicted_sentiment)
    # Print the actual and predicted labels for each entry
    print("Actual Label:", row['label'])
    print("Predicted Label:", predicted_sentiment)
    print()  # Add a newline 

Actual Label: earn
Predicted Label: earn

Actual Label: earn
Predicted Label: earn

Actual Label: earn
Predicted Label: tea

Actual Label: earn
Predicted Label: tea

Actual Label: soybean
Predicted Label: tea

Actual Label: earn
Predicted Label: tea

Actual Label: unknown
Predicted Label: unknown

Actual Label: grain
Predicted Label: grain

Actual Label: earn
Predicted Label: earn

Actual Label: unknown
Predicted Label: tea

Actual Label: crude
Predicted Label: crude

Actual Label: ship
Predicted Label: crude

Actual Label: unknown
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: money-fx
Predicted Label: money-fx

Actual Label: unknown
Predicted Label: tea

Actual Label: earn
Predicted Label: earn

Actual Label: bop
Predicted Label: tea

Actual Label: oilseed
Predicted Label: tea

Actual Label: interest
Predicted Label: unknown

Actual Label: livestock
Predicted Label: unknown

Actual Label: grain
Predicted Label: ship

Actual Label: unknown
Predicted 

Actual Label: acq
Predicted Label: acq

Actual Label: acq
Predicted Label: tea

Actual Label: unknown
Predicted Label: unknown

Actual Label: unknown
Predicted Label: unknown

Actual Label: unknown
Predicted Label: tea

Actual Label: unknown
Predicted Label: unknown

Actual Label: grain
Predicted Label: grain

Actual Label: unknown
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: earn
Predicted Label: earn

Actual Label: unknown
Predicted Label: unknown

Actual Label: earn
Predicted Label: unknown

Actual Label: money-supply
Predicted Label: unknown

Actual Label: acq
Predicted Label: acq

Actual Label: money-fx
Predicted Label: money-fx

Actual Label: earn
Predicted Label: earn

Actual Label: meal-feed
Predicted Label: grain

Actual Label: dlr
Predicted Label: money-fx

Actual Label: unknown
Predicted Label: unknown

Actual Label: money-supply
Predicted Label: unknown

Actual Label: acq
Predicted Label: acq

Actual Label: grain
Predicted Label: grain



Actual Label: earn
Predicted Label: acq

Actual Label: carcass
Predicted Label: tea

Actual Label: interest
Predicted Label: unknown

Actual Label: acq
Predicted Label: earn

Actual Label: earn
Predicted Label: tea

Actual Label: unknown
Predicted Label: earn

Actual Label: unknown
Predicted Label: unknown

Actual Label: unknown
Predicted Label: acq

Actual Label: earn
Predicted Label: earn

Actual Label: earn
Predicted Label: earn

Actual Label: earn
Predicted Label: earn

Actual Label: earn
Predicted Label: earn

Actual Label: earn
Predicted Label: earn

Actual Label: interest
Predicted Label: unknown

Actual Label: unknown
Predicted Label: acq

Actual Label: acq
Predicted Label: acq

Actual Label: acq
Predicted Label: acq

Actual Label: trade
Predicted Label: trade

Actual Label: acq
Predicted Label: acq

Actual Label: unknown
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: rice
Predicted Label: tea

Actual Label: crude
Predicted Label: tea

Actual 

Actual Label: earn
Predicted Label: earn

Actual Label: grain
Predicted Label: grain

Actual Label: acq
Predicted Label: acq

Actual Label: corn
Predicted Label: grain

Actual Label: cotton
Predicted Label: unknown

Actual Label: unknown
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: acq
Predicted Label: acq

Actual Label: ship
Predicted Label: ship

Actual Label: palm-oil
Predicted Label: crude

Actual Label: acq
Predicted Label: acq

Actual Label: interest
Predicted Label: unknown

Actual Label: unknown
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: nat-gas
Predicted Label: tea

Actual Label: earn
Predicted Label: earn

Actual Label: acq
Predicted Label: acq

Actual Label: ship
Predicted Label: tea

Actual Label: earn
Predicted Label: earn

Actual Label: acq
Predicted Label: acq

Actual Label: trade
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: veg-oil
Predicted Label: trade

Act

Actual Label: acq
Predicted Label: acq

Actual Label: acq
Predicted Label: tea

Actual Label: gnp
Predicted Label: tea

Actual Label: acq
Predicted Label: acq

Actual Label: ship
Predicted Label: tea

Actual Label: earn
Predicted Label: earn

Actual Label: reserves
Predicted Label: unknown

Actual Label: acq
Predicted Label: acq

Actual Label: corn
Predicted Label: grain

Actual Label: earn
Predicted Label: earn

Actual Label: crude
Predicted Label: tea

Actual Label: money-fx
Predicted Label: money-fx

Actual Label: money-fx
Predicted Label: unknown

Actual Label: unknown
Predicted Label: unknown

Actual Label: acq
Predicted Label: tea

Actual Label: money-supply
Predicted Label: unknown

Actual Label: trade
Predicted Label: tea

Actual Label: interest
Predicted Label: unknown

Actual Label: coffee
Predicted Label: tea

Actual Label: earn
Predicted Label: earn

Actual Label: earn
Predicted Label: earn

Actual Label: unknown
Predicted Label: unknown

Actual Label: corn
Predicted Label:

Actual Label: crude
Predicted Label: tea

Actual Label: acq
Predicted Label: acq

Actual Label: unknown
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: ipi
Predicted Label: tea

Actual Label: unknown
Predicted Label: unknown

Actual Label: unknown
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: ship
Predicted Label: unknown

Actual Label: corn
Predicted Label: grain

Actual Label: acq
Predicted Label: acq

Actual Label: ship
Predicted Label: ship

Actual Label: pet-chem
Predicted Label: acq

Actual Label: earn
Predicted Label: earn

Actual Label: earn
Predicted Label: earn

Actual Label: money-supply
Predicted Label: earn

Actual Label: unknown
Predicted Label: tea

Actual Label: wheat
Predicted Label: tea

Actual Label: earn
Predicted Label: earn

Actual Label: earn
Predicted Label: earn

Actual Label: dlr
Predicted Label: money-fx

Actual Label: money-fx
Predicted Label: unknown

Actual Label: earn
Predicted Label: e

Actual Label: earn
Predicted Label: earn

Actual Label: acq
Predicted Label: unknown

Actual Label: grain
Predicted Label: tea

Actual Label: acq
Predicted Label: acq

Actual Label: acq
Predicted Label: acq

Actual Label: crude
Predicted Label: acq

Actual Label: acq
Predicted Label: acq

Actual Label: acq
Predicted Label: acq

Actual Label: money-supply
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: corn
Predicted Label: tea

Actual Label: acq
Predicted Label: acq

Actual Label: unknown
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: acq
Predicted Label: acq

Actual Label: crude
Predicted Label: earn

Actual Label: crude
Predicted Label: crude

Actual Label: acq
Predicted Label: acq

Actual Label: fuel
Predicted Label: crude

Actual Label: acq
Predicted Label: acq

Actual Label: ship
Predicted Label: unknown

Actual Label: acq
Predicted Label: acq

Actual Label: trade
Predicted Label: trade

Actual Label: unknown
Pre

Actual Label: corn
Predicted Label: unknown

Actual Label: zinc
Predicted Label: earn

Actual Label: acq
Predicted Label: acq

Actual Label: trade
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: interest
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: iron-steel
Predicted Label: tea

Actual Label: unknown
Predicted Label: unknown

Actual Label: cocoa
Predicted Label: tea

Actual Label: gnp
Predicted Label: tea

Actual Label: gnp
Predicted Label: tea

Actual Label: acq
Predicted Label: tea

Actual Label: money-fx
Predicted Label: money-fx

Actual Label: cpi
Predicted Label: unknown

Actual Label: money-supply
Predicted Label: unknown

Actual Label: livestock
Predicted Label: unknown

Actual Label: crude
Predicted Label: acq

Actual Label: grain
Predicted Label: tea

Actual Label: trade
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: ipi
Predicted Label: tea

Actual Label: wheat
Predicte

Actual Label: earn
Predicted Label: earn

Actual Label: acq
Predicted Label: tea

Actual Label: unknown
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: acq
Predicted Label: acq

Actual Label: coffee
Predicted Label: unknown

Actual Label: grain
Predicted Label: grain

Actual Label: crude
Predicted Label: tea

Actual Label: unknown
Predicted Label: unknown

Actual Label: unknown
Predicted Label: unknown

Actual Label: unknown
Predicted Label: acq

Actual Label: earn
Predicted Label: earn

Actual Label: earn
Predicted Label: earn

Actual Label: crude
Predicted Label: tea

Actual Label: acq
Predicted Label: acq

Actual Label: cpi
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: unknown
Predicted Label: tea

Actual Label: corn
Predicted Label: grain

Actual Label: earn
Predicted Label: earn

Actual Label: acq
Predicted Label: acq

Actual Label: unknown
Predicted Label: unknown

Actual Label: unknown
Predicted Label: unknown

Actual Label: acq
Predicted Label: acq

Actual Label: earn
Predicted Label: tea

Actual Label: coffee
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: acq
Predicted Label: acq

Actual Label: money-fx
Predicted Label: money-fx

Actual Label: reserves
Predicted Label: tea

Actual Label: crude
Predicted Label: acq

Actual Label: earn
Predicted Label: earn

Actual Label: livestock
Predicted Label: trade

Actual Label: unknown
Predicted Label: unknown

Actual Label: unknown
Predicted Label: unknown

Actual Label: trade
Predicted Label: tea

Actual Label: unknown
Predicted Label: tea

Actual Label: unknown
Predicted Label: earn

Actual Label: unknown
Predicted Label: tea

Actual Label: crude
Predicted Label: crude

Actual Label: earn
Predicted Label: earn

Actual Label: acq
Predicted Label: acq

Actual Label: cpi
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: unknown
Predicted Label: unknown

Actual Label: money-fx
Predicted

Actual Label: acq
Predicted Label: acq

Actual Label: trade
Predicted Label: trade

Actual Label: earn
Predicted Label: earn

Actual Label: unknown
Predicted Label: unknown

Actual Label: cpi
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: acq
Predicted Label: acq

Actual Label: acq
Predicted Label: acq

Actual Label: interest
Predicted Label: unknown

Actual Label: bop
Predicted Label: tea

Actual Label: ship
Predicted Label: unknown

Actual Label: interest
Predicted Label: unknown

Actual Label: acq
Predicted Label: tea

Actual Label: ship
Predicted Label: unknown

Actual Label: sugar
Predicted Label: ship

Actual Label: grain
Predicted Label: grain

Actual Label: earn
Predicted Label: earn

Actual Label: money-supply
Predicted Label: unknown

Actual Label: acq
Predicted Label: acq

Actual Label: unknown
Predicted Label: unknown

Actual Label: earn
Predicted Label: tea

Actual Label: soybean
Predicted Label: tea

Actual Label: crude
Predicted Label: 

Actual Label: crude
Predicted Label: tea

Actual Label: unknown
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: unknown
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: interest
Predicted Label: unknown

Actual Label: corn
Predicted Label: grain

Actual Label: trade
Predicted Label: trade

Actual Label: grain
Predicted Label: grain

Actual Label: cocoa
Predicted Label: cocoa

Actual Label: earn
Predicted Label: earn

Actual Label: acq
Predicted Label: tea

Actual Label: earn
Predicted Label: earn

Actual Label: dlr
Predicted Label: money-fx

Actual Label: ship
Predicted Label: tea

Actual Label: alum
Predicted Label: unknown

Actual Label: earn
Predicted Label: earn

Actual Label: acq
Predicted Label: acq

Actual Label: crude
Predicted Label: tea

Actual Label: earn
Predicted Label: earn

Actual Label: unknown
Predicted Label: unknown

Actual Label: gold
Predicted Label: tea

Actual Label: interest
Predicted Label: unkn

In [37]:
# Add predictions to the test DataFrame
test['predicted_sentiment'] = predictions

# write the test DataFrame with predictions to a CSV file
test.to_csv('test_with_predictions.csv', index=False)

In [90]:
def evaluate_model(true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average='weighted')
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

In [91]:
evaluate_model(test['label'], test['predicted_sentiment'])

Accuracy: 0.524352786309785
Precision: 0.5501705588605198
Recall: 0.524352786309785
F1 Score: 0.509301644596317


  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
report = classification_report(test['label'], test['predicted_sentiment'], output_dict=True)

# Calculate macro-averaged F1-score
macro_avg_f1_score = report['macro avg']['f1-score']

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
print("Macro-averaged F1-score:", macro_avg_f1_score)

Macro-averaged F1-score: 0.06323003623193678


In [43]:
#  true labels 'y_true' and predicted labels 'y_pred'
micro_avg_f1_score = f1_score(test['label'], test['predicted_sentiment'], average='micro')
print("Micro-averaged F1-score:", micro_avg_f1_score)

Micro-averaged F1-score: 0.524352786309785


In [44]:
# next   part1   
# tf-idf          
#naive_bayes
#pipeline

In [45]:
# Define the model pipeline
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [46]:
# Train the model
model.fit(train['text'], train['label'])

In [47]:
# Predictions
predictions = model.predict(test['text'])

In [48]:
# Evaluation
accuracy = accuracy_score(test['label'], predictions)
precision = precision_score(test['label'], predictions, average='weighted')
recall = recall_score(test['label'], predictions, average='weighted')
f1 = f1_score(test['label'], predictions, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.5870996050899517
Precision: 0.4566978017008637
Recall: 0.5870996050899517
F1 Score: 0.4933873384100479


In [50]:
# Evaluation
f1 = f1_score(test['label'], predictions, average='macro')

In [51]:
print("Macro-averaged F1 Score:", f1)

Macro-averaged F1 Score: 0.053880713601106085


In [52]:
#next_one
#svm
#tf-idf

In [53]:
# Convert text data into TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train['text'])
X_test_tfidf = tfidf_vectorizer.transform(test['text'])

# Train SVM classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, train['label'])

# Predict labels for test data
predicted_labels = svm_classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(test['label'], predicted_labels)
precision = precision_score(test['label'], predicted_labels, average='weighted')
recall = recall_score(test['label'], predicted_labels, average='weighted')
f1 = f1_score(test['label'], predicted_labels, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.7376042123738482
Precision: 0.7184833203889918
Recall: 0.7376042123738482
F1 Score: 0.7223878903719414


In [79]:
# Calculate macro-averaged F1-score
macro_f1 = f1_score(test['label'], predicted_labels, average='macro')

print("Macro-averaged F1 Score:", macro_f1)

Macro-averaged F1 Score: 0.19544226498075126


In [77]:
# Calculate micro-averaged F1-score
micro_f1 = f1_score(test['label'], predicted_labels, average='micro')

print("Micro-averaged F1 Score:", micro_f1)

Micro-averaged F1 Score: 0.6731022378236069


In [72]:
# Here I need to make more features to try to increase accuracy

In [58]:
# Define TF-IDF vectorizer with unigrams and bigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Convert text data into TF-IDF vectors
X_train_tfidf = tfidf_vectorizer.fit_transform(train['text'])
X_test_tfidf = tfidf_vectorizer.transform(test['text'])

# Train SVM classifier
svm_classifier = SVC()
svm_classifier.fit(X_train_tfidf, train['label'])

# Predict labels for test data
predicted_labels = svm_classifier.predict(X_test_tfidf)

# Evaluate the model
print(classification_report(test['label'], predicted_labels))

                 precision    recall  f1-score   support

            acq       0.73      0.95      0.82       326
           alum       0.00      0.00      0.00         4
         barley       0.00      0.00      0.00         2
            bop       0.07      0.07      0.07        14
        carcass       0.00      0.00      0.00         7
          cocoa       1.00      0.70      0.82        20
        coconut       0.00      0.00      0.00         1
         coffee       0.90      0.82      0.86        22
         copper       0.20      0.20      0.20        10
     copra-cake       0.00      0.00      0.00         1
           corn       0.00      0.00      0.00        38
         cotton       0.00      0.00      0.00         6
            cpi       0.40      0.60      0.48        10
            cpu       0.00      0.00      0.00         1
          crude       0.63      0.63      0.63        81
            dlr       0.00      0.00      0.00        25
            dmk       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [60]:
# next
#Embedding, LSTM, Dense,  LabelEncoder(preprocessing)

In [61]:
# Prepare data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)  
X = tokenizer.texts_to_sequences(data)
X = pad_sequences(X)  

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [62]:
# Define model 
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=X.shape[1]))
model.add(LSTM(100))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))




In [63]:
# Compile model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x20d23142620>

In [64]:
# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.6713470816612244


In [65]:
# predictions 
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)  # the class --> with the highest probability as the predicted class



In [85]:
# Calculate precision, recall, and F1-score for each class
precision_scores = precision_score(y_test, y_pred, average='macro')
recall_scores = recall_score(y_test, y_pred, average='macro')
f1_scores = f1_score(y_test, y_pred, average='macro')

  _warn_prf(average, modifier, msg_start, len(result))


In [86]:
# Calculate macro-averaged F1-score
macro_f1_score = f1_score(y_test, y_pred, average='macro')

# Calculate micro-averaged F1-score
micro_f1_score = f1_score(y_test, y_pred, average='micro')

In [87]:
print("Precision Scores:", precision_scores)
print("Recall Scores:", recall_scores)
print("F1 Scores:", f1_scores)
print("Macro-averaged F1 Score:", macro_f1_score)
print("Micro-averaged F1 Score:", micro_f1_score)

Precision Scores: 0.16423008752247395
Recall Scores: 0.16401226895767337
F1 Scores: 0.1512556635539115
Macro-averaged F1 Score: 0.1512556635539115
Micro-averaged F1 Score: 0.6713470820535322


In [69]:
#version 3