In [27]:
import re as re
import heapq as heapq
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import random as random
from sklearn.preprocessing import StandardScaler
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.decomposition import PCA
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics 

In [28]:
# Load the pre-cleaned tweets, then replace missing processed_text values with
# empty strings so the text vectorizers further down always receive strings.
tweets = pd.read_csv('CleanedTweets.csv')
tweets = tweets.assign(processed_text=tweets['processed_text'].fillna(''))

In [29]:
# Preview the first rows to sanity-check the loaded columns.
tweets.head()

Unnamed: 0.1,Unnamed: 0,sentiment,text,mentions,hashtags,urls,exclamations,emoticons,ellipsis,word_count,processed_text
0,0,0,"- awww, that's a bummer. you shoulda got da...",1,0,1,0,1,0,19,"- awww , bummer . shoulda got david carr third..."
1,1,0,is upset that he can't update his facebook by ...,0,0,0,1,0,1,21,is upset can't update facebook texting ... mig...
2,2,0,i dived many times for the ball. managed to s...,1,0,0,0,0,0,18,dived many time ball . managed save 50 % rest ...
3,3,0,my whole body feels itchy and like its on fire,0,0,0,0,0,0,10,whole body feel itchy like fire
4,4,0,"no, it's not behaving at all. i'm mad. why am...",1,0,0,0,0,0,21,"no , not behaving . mad . ? can't see ."


## Encodings

### Bag of Words Encoding

In [30]:
def encode_bow(X_train, X_test, ngram_range=(1, 1), max_features=5000):
    """Encode the ``processed_text`` column as bag-of-words token counts.

    The vocabulary is learned from ``X_train`` only and then applied to
    ``X_test``, so the test split never influences the encoding.

    Parameters
    ----------
    X_train, X_test : DataFrame
        Frames that contain a ``processed_text`` column.
    ngram_range : tuple of (min_n, max_n)
        Forwarded to ``CountVectorizer``.
    max_features : int
        Vocabulary size cap, forwarded to ``CountVectorizer``.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix, fitted_vectorizer)``.
    """
    train_text = X_train['processed_text']
    test_text = X_test['processed_text']

    bow = CountVectorizer(ngram_range=ngram_range, max_features=max_features)

    # Fit on the training text only; the test text is merely transformed.
    train_counts = bow.fit_transform(train_text)
    test_counts = bow.transform(test_text)

    return train_counts, test_counts, bow

### TF-IDF Encoding

In [31]:
def encode_tfidf(X_train, X_test, ngram_range=(1, 1), max_features=5000):
    """Encode the ``processed_text`` column as TF-IDF weights.

    Vocabulary and IDF statistics are learned from ``X_train`` only and then
    applied to ``X_test``, so the test split never influences the encoding.

    Parameters
    ----------
    X_train, X_test : DataFrame
        Frames that contain a ``processed_text`` column.
    ngram_range : tuple of (min_n, max_n)
        Forwarded to ``TfidfVectorizer``.
    max_features : int
        Vocabulary size cap, forwarded to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix, fitted_vectorizer)``.
    """
    train_text = X_train['processed_text']
    test_text = X_test['processed_text']

    tfidf = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)

    # Fit on the training text only; the test text is merely transformed.
    train_weights = tfidf.fit_transform(train_text)
    test_weights = tfidf.transform(test_text)

    return train_weights, test_weights, tfidf

### VADER Encoding

In [32]:
# Fetch the VADER lexicon that SentimentIntensityAnalyzer loads; quiet=True suppresses the download log.
nltk.download('vader_lexicon', quiet=True)

def encode_vader(X_train, X_test):
    """Turn the raw ``text`` column into four VADER sentiment features.

    Parameters
    ----------
    X_train, X_test : DataFrame
        Frames that contain a ``text`` column.

    Returns
    -------
    tuple of DataFrame
        ``(train_scores, test_scores)``. Each frame has one row per input text
        (fresh 0..n-1 index) and the columns ``vader_compound``,
        ``vader_positive``, ``vader_negative`` and ``vader_neutral``.
    """
    analyzer = SentimentIntensityAnalyzer()

    def _score_frame(texts):
        # One polarity_scores() call per text; str() guards against non-string cells.
        per_text = [analyzer.polarity_scores(str(raw)) for raw in texts]
        return pd.DataFrame({
            'vader_compound': [scores['compound'] for scores in per_text],
            'vader_positive': [scores['pos'] for scores in per_text],
            'vader_negative': [scores['neg'] for scores in per_text],
            'vader_neutral': [scores['neu'] for scores in per_text]
        })

    return _score_frame(X_train['text']), _score_frame(X_test['text'])

### Split DataSet

In [33]:
# Separate the label from the predictors.
y = tweets['sentiment']
X = tweets.drop(columns=['sentiment'])

# Hold out 20% as a test set; stratify on the label so both splits keep the
# same class balance, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Create per-fold datasets and encode them

In [None]:
def create_fold_datasets(X_fold_train, X_fold_val):
    """Build every feature-set variant for a single cross-validation fold.

    All text encoders are fitted on ``X_fold_train`` only and then applied to
    ``X_fold_val``, so nothing from the validation fold leaks into the encoding.

    Parameters
    ----------
    X_fold_train, X_fold_val : DataFrame
        Fold splits containing the count columns listed in ``feature_cols``
        plus the ``processed_text`` and ``text`` columns.

    Returns
    -------
    dict
        Maps a dataset name to a ``(train_df, val_df)`` tuple. Keys, in order:
        ``original``, ``bow_only``, ``tfidf_only``, ``vader_only``,
        ``bow_combined``, ``tfidf_combined``, ``vader_combined``.

    Notes
    -----
    The BOW and TF-IDF matrices are densified with ``.toarray()``; with 5000
    features this can be memory-hungry on large folds.
    """
    # Hand-crafted count features computed during cleaning
    feature_cols = ['mentions', 'hashtags', 'urls', 'exclamations', 'emoticons', 'ellipsis']
    X_fold_train_features = X_fold_train[feature_cols].copy()
    X_fold_val_features = X_fold_val[feature_cols].copy()

    def _to_frames(train_matrix, val_matrix, prefix):
        # Densify a pair of sparse matrices into DataFrames sharing the same
        # positional column names (prefix_0, prefix_1, ...).
        columns = [f'{prefix}_{i}' for i in range(train_matrix.shape[1])]
        return (
            pd.DataFrame(train_matrix.toarray(), columns=columns),
            pd.DataFrame(val_matrix.toarray(), columns=columns),
        )

    def _with_original(train_df, val_df):
        # Put the hand-crafted features in front of an encoded frame. Indexes
        # are reset so the concat aligns by position, not by original row label.
        combined_train = pd.concat([X_fold_train_features.reset_index(drop=True),
                                    train_df.reset_index(drop=True)], axis=1)
        combined_val = pd.concat([X_fold_val_features.reset_index(drop=True),
                                  val_df.reset_index(drop=True)], axis=1)
        return combined_train, combined_val

    fold_datasets = {}

    # 1. Original features only
    fold_datasets['original'] = (X_fold_train_features, X_fold_val_features)

    # 2. BOW features only (unigrams and bigrams) via the shared encoder
    X_fold_train_bow, X_fold_val_bow, _ = encode_bow(
        X_fold_train, X_fold_val, ngram_range=(1, 2), max_features=5000
    )
    bow_train_df, bow_val_df = _to_frames(X_fold_train_bow, X_fold_val_bow, 'bow')
    fold_datasets['bow_only'] = (bow_train_df, bow_val_df)

    # 3. TF-IDF features only (unigrams and bigrams) via the shared encoder
    X_fold_train_tfidf, X_fold_val_tfidf, _ = encode_tfidf(
        X_fold_train, X_fold_val, ngram_range=(1, 2), max_features=5000
    )
    tfidf_train_df, tfidf_val_df = _to_frames(X_fold_train_tfidf, X_fold_val_tfidf, 'tfidf')
    fold_datasets['tfidf_only'] = (tfidf_train_df, tfidf_val_df)

    # 4. VADER features only
    vader = SentimentIntensityAnalyzer()

    def get_vader_scores(texts):
        # Score each text exactly once and reuse the result for all four
        # columns, instead of re-running the analyzer once per column.
        all_scores = [vader.polarity_scores(str(t)) for t in texts]
        return pd.DataFrame({
            'vader_compound': [s['compound'] for s in all_scores],
            'vader_pos': [s['pos'] for s in all_scores],
            'vader_neg': [s['neg'] for s in all_scores],
            'vader_neu': [s['neu'] for s in all_scores]
        })

    X_fold_train_vader = get_vader_scores(X_fold_train['text'])
    X_fold_val_vader = get_vader_scores(X_fold_val['text'])
    fold_datasets['vader_only'] = (X_fold_train_vader, X_fold_val_vader)

    # 5. Each encoding combined with the original features
    fold_datasets['bow_combined'] = _with_original(bow_train_df, bow_val_df)
    fold_datasets['tfidf_combined'] = _with_original(tfidf_train_df, tfidf_val_df)
    fold_datasets['vader_combined'] = _with_original(X_fold_train_vader, X_fold_val_vader)

    # Return all datasets for this fold
    return fold_datasets

### Random Forest

In [None]:
def train_random_forest_with_kfold(X, y, n_folds=5, n_estimator=100):
    """Cross-validate a random forest on every feature-set variant.

    For each fold the feature sets are rebuilt with ``create_fold_datasets`` so
    that text encoders are fitted on that fold's training part only. One
    ``RandomForestClassifier`` is trained per (fold, feature set) pair.

    Parameters
    ----------
    X : DataFrame
        Predictors; must contain the columns ``create_fold_datasets`` reads.
    y : Series
        Labels, positionally aligned with ``X``.
    n_folds : int, default 5
        Number of ``KFold`` splits (shuffled, ``random_state=42``).
    n_estimator : int, default 100
        Number of trees in each forest.

    Returns
    -------
    dict
        Maps each dataset name to a dict with the keys ``dataset``,
        ``acc_scores`` (per-fold accuracies), ``all_predictions`` and
        ``all_true`` (pooled over folds), ``top_features`` (from the first
        fold), ``mean_accuracy``, ``std_accuracy`` and ``confusion_matrix``.
    """
    # Dictionary to store all results
    all_results = {}

    # Create KFold object
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    # List of datasets to evaluate
    dataset_names = [
        'original',          # Just original features
        'bow_only',          # Just BOW features
        'tfidf_only',        # Just TF-IDF features
        'vader_only',        # Just VADER features
        'bow_combined',      # BOW + original features
        'tfidf_combined',    # TF-IDF + original features
        'vader_combined'     # VADER + original features
    ]

    for name in dataset_names:
        all_results[name] = {
            'dataset': name,
            # This key must match the one appended to in the fold loop and read
            # in the final metrics; a mismatch raises KeyError on the first fold.
            'acc_scores': [],
            'all_predictions': np.array([]),
            'all_true': np.array([])
        }

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"\nProcessing fold {fold+1}/{n_folds}...")

        # Split the data
        X_fold_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
        y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]

        # Generate the datasets inside the fold to prevent leakage between train and validation
        fold_datasets = create_fold_datasets(X_fold_train, X_fold_val)

        # Process each dataset type
        for dataset_name in dataset_names:
            print(f"  Training on {dataset_name}...")

            X_fold_train_data, X_fold_val_data = fold_datasets[dataset_name]
            entry = all_results[dataset_name]

            # Train Random Forest on this fold and dataset
            rf = RandomForestClassifier(n_estimators=n_estimator, random_state=42)
            rf.fit(X_fold_train_data, y_fold_train)

            # Make predictions on validation fold
            fold_preds = rf.predict(X_fold_val_data)
            fold_score = accuracy_score(y_fold_val, fold_preds)

            # Store results for this dataset type
            entry['acc_scores'].append(fold_score)
            entry['all_predictions'] = np.append(entry['all_predictions'], fold_preds)
            entry['all_true'] = np.append(entry['all_true'], y_fold_val)

            # Feature importance: recorded once, from the first fold only
            if hasattr(rf, 'feature_importances_') and fold == 0:
                importances = rf.feature_importances_
                features = X_fold_train_data.columns
                # Indices of the (up to) ten most important features, best first
                top_indices = np.argsort(importances)[::-1][:10]
                entry['top_features'] = [
                    (features[feature_idx], importances[feature_idx])
                    for feature_idx in top_indices
                ]

    # Calculate final metrics for each dataset type
    for name in dataset_names:
        entry = all_results[name]

        # Mean and spread of the per-fold accuracies
        entry['mean_accuracy'] = np.mean(entry['acc_scores'])
        entry['std_accuracy'] = np.std(entry['acc_scores'])

        # Confusion matrix over the pooled out-of-fold predictions
        entry['confusion_matrix'] = confusion_matrix(
            entry['all_true'], entry['all_predictions']
        )

        # Print results
        print(f"\n--- Results for {name} ---")
        print(f"accuracies: {entry['acc_scores']}")
        # Report the actual fold count rather than a hard-coded 5
        print(f"Mean {n_folds}-fold accuracy: {entry['mean_accuracy']:.4f}")

        # Print classification report
        print("\nClassification Report:")
        print(classification_report(entry['all_true'], entry['all_predictions']))

        # Print top features
        if 'top_features' in entry:
            print("\nTop 10 features from first fold:")
            for i, (feature, importance) in enumerate(entry['top_features'], 1):
                print(f"{i}. {feature}: {importance:.4f}")

    return all_results

In [None]:
# Train RandomForest with 100 estimators

n_folds = 5
n_estimator = 100
# Pass the settings defined above by keyword so the call and the
# configuration cannot drift apart.
results = train_random_forest_with_kfold(
    X_train,
    y_train,
    n_folds=n_folds,
    n_estimator=n_estimator,
)



Processing fold 1/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_only...
  Training on vader_only...
  Training on bow_combined...
  Training on tfidf_combined...
  Training on vader_combined...

Processing fold 2/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_only...
  Training on vader_only...
  Training on bow_combined...
  Training on tfidf_combined...
  Training on vader_combined...

Processing fold 3/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_only...
  Training on vader_only...
  Training on bow_combined...
  Training on tfidf_combined...
  Training on vader_combined...

Processing fold 4/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_only...
  Training on vader_only...
  Training on bow_combined...
  Training on tfidf_combined...
  Training on vader_combined...

Processing fold 5/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_

In [None]:
# Rank every feature set by mean cross-validated accuracy, best first
print("\n=== SUMMARY OF RESULTS ===")
for dataset_name, result in sorted(results.items(),
                                   key=lambda item: item[1]['mean_accuracy'],
                                   reverse=True):
    print(f"{dataset_name}: Accuracy = {result['mean_accuracy']:.4f} ± {result['std_accuracy']:.4f}, ")

# Report the single best-scoring feature set
best_dataset = max(results, key=lambda name: results[name]['mean_accuracy'])
print(f"\nBest dataset: {best_dataset} with accuracy {results[best_dataset]['mean_accuracy']:.4f} ± " +
        f"{results[best_dataset]['std_accuracy']:.4f}")


=== SUMMARY OF RESULTS ===
tfidf_combined: CV Accuracy = 0.7577 ± 0.0044, 
tfidf_only: CV Accuracy = 0.7549 ± 0.0031, 
bow_combined: CV Accuracy = 0.7534 ± 0.0027, 
bow_only: CV Accuracy = 0.7504 ± 0.0027, 
vader_combined: CV Accuracy = 0.6790 ± 0.0028, 
vader_only: CV Accuracy = 0.6489 ± 0.0040, 
original: CV Accuracy = 0.6104 ± 0.0023, 

Best dataset: tfidf_combined with CV accuracy 0.7577 ± 0.0044


In [40]:
# Train RandomForest with 500 estimators

n_folds = 5
n_estimator = 500
# Same cross-validation as the 100-tree run, only with a larger forest.
results = train_random_forest_with_kfold(
    X_train,
    y_train,
    n_folds=n_folds,
    n_estimator=n_estimator,
)


Processing fold 1/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_only...
  Training on vader_only...
  Training on bow_combined...
  Training on tfidf_combined...
  Training on vader_combined...

Processing fold 2/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_only...
  Training on vader_only...
  Training on bow_combined...
  Training on tfidf_combined...
  Training on vader_combined...

Processing fold 3/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_only...
  Training on vader_only...
  Training on bow_combined...
  Training on tfidf_combined...
  Training on vader_combined...

Processing fold 4/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_only...
  Training on vader_only...
  Training on bow_combined...
  Training on tfidf_combined...
  Training on vader_combined...

Processing fold 5/5...
  Training on original...
  Training on bow_only...
  Training on tfidf_

In [None]:
# Rank every feature set by mean cross-validated accuracy, best first
print("\n=== SUMMARY OF RESULTS ===")
for dataset_name, result in sorted(results.items(),
                                   key=lambda item: item[1]['mean_accuracy'],
                                   reverse=True):
    print(f"{dataset_name}: Accuracy = {result['mean_accuracy']:.4f} ± {result['std_accuracy']:.4f}, ")

# Report the single best-scoring feature set
best_dataset = max(results, key=lambda name: results[name]['mean_accuracy'])
print(f"\nBest dataset: {best_dataset} with accuracy {results[best_dataset]['mean_accuracy']:.4f} ± " +
        f"{results[best_dataset]['std_accuracy']:.4f}")


=== SUMMARY OF RESULTS ===
tfidf_combined: Accuracy = 0.7589 ± 0.0043, 
tfidf_only: Accuracy = 0.7568 ± 0.0033, 
bow_combined: Accuracy = 0.7562 ± 0.0032, 
bow_only: Accuracy = 0.7512 ± 0.0027, 
vader_combined: Accuracy = 0.6801 ± 0.0028, 
vader_only: Accuracy = 0.6501 ± 0.0037, 
original: Accuracy = 0.6102 ± 0.0022, 

Best dataset: tfidf_combined with accuracy 0.7589 ± 0.0043


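TF-IDF combined with the original features achieved the highest mean cross-validated accuracy in both runs (100 and 500 trees), so TF-IDF is chosen as the text encoding for subsequent training.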