In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
nltk.download('stopwords')
from nltk.corpus import stopwords


In [None]:
# NOTE(review): this cell reads like a pasted patch fragment rather than a
# runnable pipeline step -- the "existing code" markers stand in for omitted
# code. It uses X, y, VotingClassifier, xgb_model, rf_model and dt_model, none
# of which are defined or imported by the cells visible above it, so running
# the notebook top to bottom on a fresh kernel would stop here with a
# NameError. TODO: confirm, then remove this cell or move it below the cells
# that define those names.
# ... existing code ...

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode the sentiment labels into numeric values
le = LabelEncoder()

# Transform the labels from strings to integers
# (the encoder is fitted on the training labels only, then reused for the test labels)
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# ... existing code ...

# For VotingClassifier, either remove SVC or modify it
# probability=True makes SVC expose predict_proba, which soft voting relies on.
svm_model = SVC(kernel='linear', class_weight='balanced', probability=True)  # Add probability=True

# Soft-voting ensemble over the four models; this only builds it, no fit happens here.
voting_clf = VotingClassifier(estimators=[
    ('xgb', xgb_model),
    ('rf', rf_model),
    ('svm', svm_model),  # Ensure SVC supports predict_proba
    ('dt', dt_model)
], voting='soft')

# ... existing code ...

In [None]:
# Read the raw CSV into a DataFrame; the file is expected next to the notebook
DATASET_PATH = 'tweet_emotions.csv'
df = pd.read_csv(DATASET_PATH)

# Show the first rows as a quick sanity check (last expression -> rich display)
df.head()


In [None]:
# Function to clean tweets
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() rebuilds the list on every call, so cache it.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text, stop_words=None):
    """Normalise one tweet into a lowercase, space-separated token string.

    Steps, in order: strip URLs, strip @mentions and the '#' symbol (the
    hashtag word itself is kept), replace every run of non-alphanumeric
    characters with a single space, lowercase, and drop stop words.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. None or a NaN from a
        missing cell) is treated as an empty tweet.
    stop_words : collection of str, optional
        Lowercase words to drop. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing is left.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = _english_stopwords()

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # remove urls
    # '@\w+' removes the whole mention; the previous pattern '\@w+' only matched
    # '@' followed by literal 'w' characters, so user names leaked into the text.
    text = re.sub(r'@\w+|#', '', text)  # remove mentions and the hashtag symbol
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)  # remove special characters
    text = text.lower()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Apply the cleaning function
df['cleaned_content'] = df['content'].apply(clean_text)

# Define features (X) and labels (y). y keeps the original string labels so
# that reports and plots built from y_train / y_test show sentiment names.
X = df['cleaned_content']
y = df['sentiment']

# Split data into training and test sets. This has to happen before anything
# is derived from y_train / y_test; the encoding step below depends on them.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode the sentiment labels into numeric values.
# The encoder is fitted once, on the string labels, so le.classes_ holds the
# sentiment names and le.inverse_transform() maps integers back to them.
le = LabelEncoder()
le.fit(y)

# Integer-encoded copies of the targets, for estimators that need numeric classes
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

# Vectorize the text data using TF-IDF.
# The vocabulary is learned from the training split only and reused for the test split.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)



In [None]:
# Fit a multi-class XGBoost model on the integer-encoded training labels
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    eval_metric='mlogloss',
)
xgb_model.fit(X_train_tfidf, y_train_encoded)

# Predict on the held-out split, then pass the predictions back through the encoder
y_pred_xgb = xgb_model.predict(X_test_tfidf)
y_pred_xgb_labels = le.inverse_transform(y_pred_xgb)

# Per-class precision / recall / F1 on the test split
print("XGBoost Model Performance:\n")
xgb_report = classification_report(y_test, y_pred_xgb_labels)
print(xgb_report)



In [None]:
# Random forest baseline: 100 trees, fixed seed for repeatable results
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train_tfidf, y_train)

# Score the held-out split
y_pred_rf = rf_model.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test split
print("Random Forest Model Performance:\n")
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)


In [None]:
# Linear-kernel SVM; class_weight='balanced' reweights classes by inverse frequency
svm_model = SVC(class_weight='balanced', kernel='linear')
svm_model.fit(X_train_tfidf, y_train)

# Score the held-out split
y_pred_svm = svm_model.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test split
print("SVM Model Performance:\n")
svm_report = classification_report(y_test, y_pred_svm)
print(svm_report)


In [None]:
# Single decision tree, depth-capped at 10, fixed seed for repeatable results
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10)
dt_model.fit(X_train_tfidf, y_train)

# Score the held-out split
y_pred_dt = dt_model.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test split
print("Decision Tree Model Performance:\n")
dt_report = classification_report(y_test, y_pred_dt)
print(dt_report)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, model_name):
    """Draw an annotated confusion-matrix heatmap for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same label space as ``y_true``.
    model_name : str
        Name shown in the figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Local import keeps this function self-contained within the cell.
    from sklearn.utils.multiclass import unique_labels

    # Rows/columns of the matrix cover every label seen in y_true OR y_pred.
    # Deriving the tick labels from the same list keeps the axes aligned with
    # the matrix even when a class shows up only in the predictions.
    labels = unique_labels(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    plt.figure(figsize=(10,7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title(f"Confusion Matrix for {model_name}")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# Confusion matrices
# XGBoost is plotted from y_pred_xgb_labels (its predictions after
# le.inverse_transform) so that, like the other three models, its predictions
# are in the same label space as y_test.
plot_confusion_matrix(y_test, y_pred_xgb_labels, "XGBoost")
plot_confusion_matrix(y_test, y_pred_rf, "Random Forest")
plot_confusion_matrix(y_test, y_pred_svm, "SVM")
plot_confusion_matrix(y_test, y_pred_dt, "Decision Tree")


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate random-forest settings: 2 x 2 = 4 combinations, each scored with 3-fold CV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
}
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
)
grid_search.fit(X_train_tfidf, y_train)

# Report the winning combination and its mean cross-validated accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")


In [None]:
from sklearn.ensemble import VotingClassifier

# Soft voting averages each member's predict_proba output, so every member
# must expose predict_proba. SVC only does so when constructed with
# probability=True, and svm_model is built without it, so the ensemble gets its
# own probability-enabled SVC with otherwise identical settings.
voting_clf = VotingClassifier(estimators=[
    ('xgb', xgb_model),
    ('rf', rf_model),
    ('svm', SVC(kernel='linear', class_weight='balanced', probability=True)),
    ('dt', dt_model)
], voting='soft')

# Fit the ensemble on the training split and score the held-out split
voting_clf.fit(X_train_tfidf, y_train)
y_pred_voting = voting_clf.predict(X_test_tfidf)

print(classification_report(y_test, y_pred_voting))


In [None]:
from sklearn.model_selection import cross_val_score

# For XGBoost: 5-fold cross-validation on the training split.
# Scored against y_train_encoded, the same integer-encoded target the model is
# fitted on, so the cross-validated fits see the label format XGBoost was given.
cross_val_scores = cross_val_score(xgb_model, X_train_tfidf, y_train_encoded, cv=5, scoring='accuracy')
print(f"Mean Cross-Validation Accuracy: {np.mean(cross_val_scores)}")
