In [2]:
import re as re
import heapq as heapq
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import random as random
from sklearn.preprocessing import StandardScaler
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.decomposition import PCA
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics 
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
tweets = pd.read_csv('CleanedTweets.csv')
tweets['processed_text'] = tweets['processed_text'].fillna('')

In [3]:
tweets.head()

Unnamed: 0.1,Unnamed: 0,sentiment,text,mentions,hashtags,urls,exclamations,emoticons,ellipsis,word_count,processed_text
0,0,0,"- awww, that's a bummer. you shoulda got da...",1,0,1,0,1,0,19,"- awww , bummer . shoulda got david carr third..."
1,1,0,is upset that he can't update his facebook by ...,0,0,0,1,0,1,21,is upset can't update facebook texting ... mig...
2,2,0,i dived many times for the ball. managed to s...,1,0,0,0,0,0,18,dived many time ball . managed save 50 % rest ...
3,3,0,my whole body feels itchy and like its on fire,0,0,0,0,0,0,10,whole body feel itchy like fire
4,4,0,"no, it's not behaving at all. i'm mad. why am...",1,0,0,0,0,0,21,"no , not behaving . mad . ? can't see ."


## Encodings

### Bag of Words Encoding

In [4]:
def encode_bow(X_train, X_test, ngram_range=(1, 2), max_features=5000):
    vectorizer = CountVectorizer(
        max_features=max_features,
        ngram_range=ngram_range
    )
    
    X_train_bow = vectorizer.fit_transform(X_train['processed_text'])
    X_test_bow = vectorizer.transform(X_test['processed_text'])
    
    return X_train_bow, X_test_bow, vectorizer

### TF-IDF Encoding

In [5]:
def encode_tfidf(X_train, X_test, ngram_range=(1, 2), max_features=5000):
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range
    )
    
    X_train_tfidf = vectorizer.fit_transform(X_train['processed_text'])
    X_test_tfidf = vectorizer.transform(X_test['processed_text'])

    return X_train_tfidf, X_test_tfidf, vectorizer

### VADERS Encoding

In [6]:
nltk.download('vader_lexicon', quiet=True)

def encode_vader(X_train, X_test):
    vader = SentimentIntensityAnalyzer()
    
    def extract_vader_scores(texts):
        compound_scores = []
        positive_scores = []
        negative_scores = []
        neutral_scores = []
        
        for text in texts:
            scores = vader.polarity_scores(str(text))
            compound_scores.append(scores['compound'])
            positive_scores.append(scores['pos'])
            negative_scores.append(scores['neg'])
            neutral_scores.append(scores['neu'])
        
        return pd.DataFrame({
            'vader_compound': compound_scores,
            'vader_positive': positive_scores,
            'vader_negative': negative_scores,
            'vader_neutral': neutral_scores
        })
    
    X_train_vader = extract_vader_scores(X_train['text'])
    X_test_vader = extract_vader_scores(X_test['text'])
    
    return X_train_vader, X_test_vader

### Split DataSet

In [7]:
X = tweets.drop(columns = ['sentiment'])
y = tweets['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Create Cleaned DataSet and encoding them 

In [9]:
def create_fold_datasets(X_fold_train, X_fold_val):
    # Get original features
    feature_cols = ['mentions', 'hashtags', 'urls', 'exclamations', 'emoticons', 'ellipsis']
    X_fold_train_features = X_fold_train[feature_cols].copy()
    X_fold_val_features = X_fold_val[feature_cols].copy()
    
    # Create datasets dictionary
    fold_datasets = {}
    
    # 1. Original features only
    fold_datasets['original'] = (X_fold_train_features, X_fold_val_features)
    
    # 2. BOW features only (Unigram and Bigrams)
    bow_vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 2))
    X_fold_train_bow = bow_vectorizer.fit_transform(X_fold_train['processed_text'])
    X_fold_val_bow = bow_vectorizer.transform(X_fold_val['processed_text'])
    
    # Convert to DataFrame
    bow_train_df = pd.DataFrame(
        X_fold_train_bow.toarray(),
        columns=[f'bow_{i}' for i in range(X_fold_train_bow.shape[1])]
    )
    bow_val_df = pd.DataFrame(
        X_fold_val_bow.toarray(),
        columns=[f'bow_{i}' for i in range(X_fold_train_bow.shape[1])]
    )
    
    fold_datasets['bow_only'] = (bow_train_df, bow_val_df)
    
    # 3. TF-IDF features only (Unigram and Bigrams)
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
    X_fold_train_tfidf = tfidf_vectorizer.fit_transform(X_fold_train['processed_text'])
    X_fold_val_tfidf = tfidf_vectorizer.transform(X_fold_val['processed_text'])
    
    # Convert to DataFrame
    tfidf_train_df = pd.DataFrame(
        X_fold_train_tfidf.toarray(),
        columns=[f'tfidf_{i}' for i in range(X_fold_train_tfidf.shape[1])]
    )
    tfidf_val_df = pd.DataFrame(
        X_fold_val_tfidf.toarray(),
        columns=[f'tfidf_{i}' for i in range(X_fold_train_tfidf.shape[1])]
    )
    
    fold_datasets['tfidf_only'] = (tfidf_train_df, tfidf_val_df)
    
    # 4. VADER features only
    vader = SentimentIntensityAnalyzer()
    
    def get_vader_scores(texts):
        scores_df = pd.DataFrame()
        scores_df['vader_compound'] = [vader.polarity_scores(str(t))['compound'] for t in texts]
        scores_df['vader_pos'] = [vader.polarity_scores(str(t))['pos'] for t in texts]
        scores_df['vader_neg'] = [vader.polarity_scores(str(t))['neg'] for t in texts]
        scores_df['vader_neu'] = [vader.polarity_scores(str(t))['neu'] for t in texts]
        return scores_df
    
    X_fold_train_vader = get_vader_scores(X_fold_train['text'])
    X_fold_val_vader = get_vader_scores(X_fold_val['text'])
    
    fold_datasets['vader_only'] = (X_fold_train_vader, X_fold_val_vader)
    
    # 5. Combined datasets
    # BOW + original
    X_fold_train_bow_combined = pd.concat([X_fold_train_features.reset_index(drop=True), 
                                         bow_train_df.reset_index(drop=True)], axis=1)
    X_fold_val_bow_combined = pd.concat([X_fold_val_features.reset_index(drop=True), 
                                       bow_val_df.reset_index(drop=True)], axis=1)
    
    fold_datasets['bow_combined'] = (X_fold_train_bow_combined, X_fold_val_bow_combined)
    
    # TF-IDF + original
    X_fold_train_tfidf_combined = pd.concat([X_fold_train_features.reset_index(drop=True), 
                                           tfidf_train_df.reset_index(drop=True)], axis=1)
    X_fold_val_tfidf_combined = pd.concat([X_fold_val_features.reset_index(drop=True), 
                                         tfidf_val_df.reset_index(drop=True)], axis=1)
    
    fold_datasets['tfidf_combined'] = (X_fold_train_tfidf_combined, X_fold_val_tfidf_combined)
    
    # VADER + original
    X_fold_train_vader_combined = pd.concat([X_fold_train_features.reset_index(drop=True), 
                                           X_fold_train_vader.reset_index(drop=True)], axis=1)
    X_fold_val_vader_combined = pd.concat([X_fold_val_features.reset_index(drop=True), 
                                         X_fold_val_vader.reset_index(drop=True)], axis=1)
    
    fold_datasets['vader_combined'] = (X_fold_train_vader_combined, X_fold_val_vader_combined)
    
    

    # Return all datasets for this fold
    return fold_datasets

### Random Forest

In [13]:
def train_random_forest_with_kfold(X, y, n_folds=5, n_estimator = 100):
    # Dictionary to store all results
    all_results = {}
    
    # Create KFold object
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    
    # List of datasets to evaluate
    dataset_names = [
        'original',          # Just original features
        'bow_only',          # Just BOW features
        'tfidf_only',        # Just TF-IDF features
        'vader_only',        # Just VADER features
        'bow_combined',      # BOW + original features
        'tfidf_combined',    # TF-IDF + original features
        'vader_combined'     # VADER + original features
    ]

    for name in dataset_names:
        all_results[name] = {
            'dataset': name,
            'acc_scores': [],
            'all_predictions': np.array([]),
            'all_true': np.array([])
        }
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"\nProcessing fold {fold+1}/{n_folds}...")
        
        # Split the data
        X_fold_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
        y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # Generate DataSet for each fold to prevent leakage between train and test
        fold_datasets = create_fold_datasets(X_fold_train, X_fold_val)

        # Process each dataset type
        for dataset_name in dataset_names:
            print(f"  Training on {dataset_name}...")

            X_fold_train_data, X_fold_val_data = fold_datasets[dataset_name]
            
            # Train Random Forest on this fold and dataset 
            # min_samples_leaf and min_samples_split higher than default to prevent overfitting and to decrease required computation
            rf = RandomForestClassifier(n_estimators=n_estimator, 
                                        random_state=42,
                                        min_samples_leaf=10,
                                        min_samples_split=20)
            
            rf.fit(X_fold_train_data, y_fold_train)

            
            # Make predictions on validation fold
            fold_preds = rf.predict(X_fold_val_data)
            fold_score = accuracy_score(y_fold_val, fold_preds)
            
            # Store results for this dataset type
            all_results[dataset_name]['acc_scores'].append(fold_score)
            all_results[dataset_name]['all_predictions'] = np.append(
                all_results[dataset_name]['all_predictions'], fold_preds
            )
            all_results[dataset_name]['all_true'] = np.append(
                all_results[dataset_name]['all_true'], y_fold_val
            )
            
            # Feature importance
            if hasattr(rf, 'feature_importances_') and fold == 0:
                importances = rf.feature_importances_
                indices = np.argsort(importances)[::-1]
                features = X_fold_train_data.columns
                
                # Store top features
                top_features = []
                for i in range(min(10, len(features))):
                    feature_idx = indices[i]
                    top_features.append((features[feature_idx], importances[feature_idx]))
                
                all_results[dataset_name]['top_features'] = top_features
                all_results[dataset_name]['feature_importances'] = importances
                all_results[dataset_name]['features'] = features
            

    
    # Calculate final metrics for each dataset type
    for name in dataset_names:
        # Calculate mean
        all_results[name]['mean_accuracy'] = np.mean(all_results[name]['acc_scores'])
        all_results[name]['std_accuracy'] = np.std(all_results[name]['acc_scores'])
        
        # Calculate confusion matrix
        all_results[name]['confusion_matrix'] = confusion_matrix(
            all_results[name]['all_true'], all_results[name]['all_predictions']
        )
        
        # Print results
        print(f"\n--- Results for {name} ---")
        print(f"accuracies: {all_results[name]['acc_scores']}")
        print(f"Mean 5-fold accuracy: {all_results[name]['mean_accuracy']:.4f}")
        
        # Print classification report
        print("\nClassification Report:")
        print(classification_report(all_results[name]['all_true'], all_results[name]['all_predictions']))
        
        # Print top features
        if 'top_features' in all_results[name]:
            print("\nTop 10 features from first fold:")
            for i, (feature, importance) in enumerate(all_results[name]['top_features'], 1):
                print(f"{i}. {feature}: {importance:.4f}")
    
    return all_results

In [10]:
# Train RandomForest with 100 estimators

n_folds = 5
n_estimator = 100
results = train_random_forest_with_kfold(X_train, y_train, n_folds, n_estimator = n_estimator)



Processing fold 1/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_only...
  Training on vader_only...
  Training on bow_combined...
  Training on tfidf_combined...
  Training on vader_combined...

Processing fold 2/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_only...
  Training on vader_only...
  Training on bow_combined...
  Training on tfidf_combined...
  Training on vader_combined...

Processing fold 3/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_only...
  Training on vader_only...
  Training on bow_combined...
  Training on tfidf_combined...
  Training on vader_combined...

Processing fold 4/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_only...
  Training on vader_only...
  Training on bow_combined...
  Training on tfidf_combined...
  Training on vader_combined...

Processing fold 5/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_

In [11]:
# Print summary of all results
print("\n=== SUMMARY OF RESULTS ===")
for dataset_name in sorted(results.keys(), key=lambda k: results[k]['mean_accuracy'], reverse=True):
    result = results[dataset_name]
    print(f"{dataset_name}: Accuracy = {result['mean_accuracy']:.4f} ± {result['std_accuracy']:.4f}, ")

# Identify best dataset
best_dataset = max(results.keys(), key=lambda k: results[k]['mean_accuracy'])
print(f"\nBest dataset: {best_dataset} with accuracy {results[best_dataset]['mean_accuracy']:.4f} ± " +
        f"{results[best_dataset]['std_accuracy']:.4f}")


=== SUMMARY OF RESULTS ===
tfidf_combined: Accuracy = 0.7495 ± 0.0039, 
tfidf_only: Accuracy = 0.7478 ± 0.0045, 
bow_combined: Accuracy = 0.7475 ± 0.0026, 
bow_only: Accuracy = 0.7450 ± 0.0042, 
vader_combined: Accuracy = 0.7040 ± 0.0032, 
vader_only: Accuracy = 0.6736 ± 0.0022, 
original: Accuracy = 0.6110 ± 0.0022, 

Best dataset: tfidf_combined with accuracy 0.7495 ± 0.0039


TF-IDF would be chosen as the encoding to train with 

### PCA Optimisation

In [None]:
# Add missing imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

# 1. Load and clean data
cleanedtweets = pd.read_csv("CleanedTweets.csv")
cleanedtweets["processed_text"] = cleanedtweets["processed_text"].fillna("")

X_text = cleanedtweets['processed_text']
y = cleanedtweets['sentiment']

# 2. Split text before vectorization
X_train_text, X_test_text, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=100)

# 3. TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# 4. PCA + KNN evaluation loop
model_results = []
optimal_k_list = []

for dim in range(25, 250, 25):
    print(f"Evaluating PCA dimension: {dim}")

    # Convert sparse TF-IDF to dense matrix for PCA
    X_train_dense = X_train_tfidf.toarray()
    X_test_dense = X_test_tfidf.toarray()

    pca = PCA(n_components=dim)
    X_train_pca = pca.fit_transform(X_train_dense)
    X_test_pca = pca.transform(X_test_dense)

    # Train/Val split for KNN tuning
    X_train_cv, X_val_cv, y_train_cv, y_val_cv = train_test_split(X_train_pca, y_train, test_size=0.2, random_state=42)

    # Try odd k values
    k_values = list(range(1, 50, 2))
    scores = []

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        score = cross_val_score(knn, X_train_cv, y_train_cv, cv=5)
        scores.append(np.mean(score))

    # Elbow detection
    score_differences = np.diff(scores)
    threshold = 0.001
    elbow_idx = np.where(score_differences < threshold)[0]

    if len(elbow_idx) == 0:
        optimal_k = k_values[np.argmax(scores)]  # Fallback
    else:
        optimal_k = k_values[elbow_idx[0]]

    optimal_k_list.append(optimal_k)
    # Final training with optimal k
    knn_pca = KNeighborsClassifier(n_neighbors=optimal_k)
    knn_pca.fit(X_train_cv, y_train_cv)
    y_pred_pca = knn_pca.predict(X_test_pca)

    # Example metrics calculation (replace with your custom function)
    acc = accuracy_score(y_test, y_pred_pca)
    f1 = f1_score(y_test, y_pred_pca, average="weighted")
    mi = pd.DataFrame({"PCA_Dim": [dim], "Accuracy": [acc], "F1_Score": [f1]})

    model_results.append(mi)

final_results = pd.concat(model_results).reset_index(drop=True)
print(final_results)




Evaluating PCA dimension: 25
Evaluating PCA dimension: 50
Evaluating PCA dimension: 75
Evaluating PCA dimension: 100
Evaluating PCA dimension: 125
Evaluating PCA dimension: 150
Evaluating PCA dimension: 175
Evaluating PCA dimension: 200
Evaluating PCA dimension: 225
   PCA_Dim  Accuracy  F1_Score
0       25   0.65530  0.655258
1       50   0.67735  0.677207
2       75   0.67460  0.674272
3      100   0.67480  0.674389
4      125   0.68110  0.680519
5      150   0.68660  0.685667
6      175   0.68725  0.686037
7      200   0.69155  0.690373
8      225   0.68955  0.687857


In [25]:
# 1. Load and clean data
cleanedtweets = pd.read_csv("CleanedTweets.csv")
cleanedtweets["processed_text"] = cleanedtweets["processed_text"].fillna("")

X_text = cleanedtweets['processed_text']
y = cleanedtweets['sentiment']

# 2. Split text before vectorization
X_train_text, X_test_text, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=100)

# 3. TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# 4. PCA + KNN evaluation loop
model_results = []
optimal_k_list = []

for dim in range(200, 225, 25):
    print(f"Evaluating PCA dimension: {dim}")

    # Convert sparse TF-IDF to dense matrix for PCA
    X_train_dense = X_train_tfidf.toarray()
    X_test_dense = X_test_tfidf.toarray()

    pca = PCA(n_components=dim)
    X_train_pca = pca.fit_transform(X_train_dense)
    X_test_pca = pca.transform(X_test_dense)

    # Train/Val split for KNN tuning
    X_train_cv, X_val_cv, y_train_cv, y_val_cv = train_test_split(X_train_pca, y_train, test_size=0.2, random_state=42)

    # Try odd k values
    k_values = list(range(1, 50, 2))
    scores = []

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        score = cross_val_score(knn, X_train_cv, y_train_cv, cv=5)
        scores.append(np.mean(score))

    # Elbow detection
    score_differences = np.diff(scores)
    threshold = 0.001
    elbow_idx = np.where(score_differences < threshold)[0]

    if len(elbow_idx) == 0:
        optimal_k = k_values[np.argmax(scores)]  # Fallback
    else:
        optimal_k = k_values[elbow_idx[0]]

    optimal_k_list.append(optimal_k)
    # Final training with optimal k
    knn_pca = KNeighborsClassifier(n_neighbors=optimal_k)
    knn_pca.fit(X_train_cv, y_train_cv)
    y_pred_pca = knn_pca.predict(X_test_pca)

    # Example metrics calculation (replace with your custom function)
    acc = accuracy_score(y_test, y_pred_pca)
    f1 = f1_score(y_test, y_pred_pca, average="weighted")
    mi = pd.DataFrame({"PCA_Dim": [dim], "Accuracy": [acc], "F1_Score": [f1]})

    model_results.append(mi)

Evaluating PCA dimension: 200


In [26]:
print(optimal_k_list)

[11]


In [15]:
print(scores)

[0.635375, 0.6565156249999999, 0.663890625, 0.6714687500000001, 0.6740156249999999, 0.678453125, 0.6799375, 0.6792499999999999, 0.681171875, 0.68259375, 0.683109375, 0.68478125, 0.6853125, 0.6862812500000001, 0.6870156250000001, 0.687125, 0.6864531250000001, 0.686890625, 0.68875, 0.68871875, 0.6881718749999999, 0.688578125, 0.6877812500000001, 0.6878593749999999, 0.6887343749999999]


PCA_Dim of 200 is chosen as it has the highest accuracy

In [20]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

cleanedtweets = pd.read_csv("CleanedTweets.csv")
cleanedtweets["processed_text"] = cleanedtweets["processed_text"].fillna("")

X_text = cleanedtweets.drop(columns = ['sentiment'])
y = cleanedtweets['sentiment']


# Models
models = {
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(n_neighbors=11)
}

# Initialize results storage
results = {}
for name in models:
    results[name] = {'acc': [], 'prec': [], 'rec': [], 'f1': []}

# 5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(X_text)):
    print(f"Fold {fold+1}/5")
    
    # Split data
    X_train, X_test = X_text.iloc[train_idx], X_text.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Get encodings
    fold_datasets = create_fold_datasets(X_train, X_test)
    X_train_data, X_test_data = fold_datasets['tfidf_combined']
    
    # Apply PCA
    pca = PCA(n_components=200)
    X_train_pca = pca.fit_transform(X_train_data)
    X_test_pca = pca.transform(X_test_data)
    
    # Train and evaluate each model
    for name, model in models.items():
        # Train
        model.fit(X_train_pca, y_train)
        
        # Predict
        preds = model.predict(X_test_pca)
        
        # Calculate metrics
        results[name]['acc'].append(accuracy_score(y_test, preds))
        results[name]['prec'].append(precision_score(y_test, preds, average='weighted'))
        results[name]['rec'].append(recall_score(y_test, preds, average='weighted'))
        results[name]['f1'].append(f1_score(y_test, preds, average='weighted'))

# Print results
print("\nRESULTS:")
for name in models:
    print(f"\n{name}:")
    print(f"Accuracy:  {np.mean(results[name]['acc']):.4f}")
    print(f"Precision: {np.mean(results[name]['prec']):.4f}")
    print(f"Recall:    {np.mean(results[name]['rec']):.4f}")
    print(f"F1-Score:  {np.mean(results[name]['f1']):.4f}")


Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5

RESULTS:

Naive Bayes:
Accuracy:  0.6562
Precision: 0.6578
Recall:    0.6562
F1-Score:  0.6554

Logistic Regression:
Accuracy:  0.7306
Precision: 0.7306
Recall:    0.7306
F1-Score:  0.7306

KNN:
Accuracy:  0.6687
Precision: 0.6706
Recall:    0.6687
F1-Score:  0.6677


In [22]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

cleanedtweets = pd.read_csv("CleanedTweets.csv")
cleanedtweets["processed_text"] = cleanedtweets["processed_text"].fillna("")

X_text = cleanedtweets.drop(columns = ['sentiment'])
y = cleanedtweets['sentiment']


# Models
models = {
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(n_neighbors=11)
}

# Initialize results storage
results = {}
for name in models:
    results[name] = {'acc': [], 'prec': [], 'rec': [], 'f1': []}

# 5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(X_text)):
    print(f"Fold {fold+1}/5")
    
    # Split data
    X_train, X_test = X_text.iloc[train_idx], X_text.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Get encodings
    fold_datasets = create_fold_datasets(X_train, X_test)
    X_train_data, X_test_data = fold_datasets['tfidf_combined']
    
    # Train and evaluate each model
    for name, model in models.items():
        # Train
        model.fit(X_train_data, y_train)
        
        # Predict
        preds = model.predict(X_test_data)
        
        # Calculate metrics
        results[name]['acc'].append(accuracy_score(y_test, preds))
        results[name]['prec'].append(precision_score(y_test, preds, average='weighted'))
        results[name]['rec'].append(recall_score(y_test, preds, average='weighted'))
        results[name]['f1'].append(f1_score(y_test, preds, average='weighted'))

# Print results
print("\nRESULTS:")
for name in models:
    print(f"\n{name}:")
    print(f"Accuracy:  {np.mean(results[name]['acc']):.4f}")
    print(f"Precision: {np.mean(results[name]['prec']):.4f}")
    print(f"Recall:    {np.mean(results[name]['rec']):.4f}")
    print(f"F1-Score:  {np.mean(results[name]['f1']):.4f}")


Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5

RESULTS:

Naive Bayes:
Accuracy:  0.7182
Precision: 0.7207
Recall:    0.7182
F1-Score:  0.7173

Logistic Regression:
Accuracy:  0.7793
Precision: 0.7795
Recall:    0.7793
F1-Score:  0.7793

KNN:
Accuracy:  0.5919
Precision: 0.6113
Recall:    0.5919
F1-Score:  0.5734
