In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read the tweet dataset (CSV in the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='Tweets.csv')

In [None]:
# Preprocess text (basic cleaning)
def preprocess_text(text):
    """Lower-case a tweet and keep only letters and whitespace.

    Args:
        text: Raw tweet text (a scalar). Missing values (``None``, float
            ``NaN``, ``pd.NA``) are treated as empty text; any other
            non-string scalar is converted with ``str`` first.

    Returns:
        str: The lower-cased text with every character that is neither
        alphabetic nor whitespace removed. Whitespace is kept as-is
        (runs of spaces are not collapsed).
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as float NaN; calling .lower() on them would
        # raise AttributeError and abort the whole column-wide apply.
        if pd.isna(text):
            return ''
        text = str(text)
    text = text.lower()
    return ''.join(c for c in text if c.isalpha() or c.isspace())

# Clean every raw tweet element-wise and keep the result in a new column
df['cleaned_tweet'] = df['tweet'].map(preprocess_text)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X, y = df['cleaned_tweet'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build TF-IDF features: the vocabulary (capped at 5000 terms) and IDF weights
# are fitted on the training text only, then reused to transform the test text
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [None]:
# Candidate models, keyed by the display name used when reporting metrics.
# random_state is pinned on the tree-based models, whose training is
# randomised, so re-running the notebook reproduces the same scores --
# consistent with the seeded (random_state=42) train/test split.
classifiers = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

In [None]:
# Fit each candidate model on the TF-IDF training matrix, score it on the
# held-out split, print the scores and collect one record per model
results = []
for name, clf in classifiers.items():
    # scikit-learn's fit() returns the estimator, so train + predict chain
    y_pred = clf.fit(X_train_vec, y_train).predict(X_test_vec)

    # Score the predictions against the true test labels
    accuracy, precision, recall, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    cm = confusion_matrix(y_test, y_pred)

    # Report the scores for this model, one per line
    print(
        f"Metrics for {name}:",
        f"Accuracy: {accuracy}",
        f"Precision: {precision}",
        f"Recall: {recall}",
        f"F1 Score: {f1}",
        sep="\n",
    )

    # Keep the record for the summary table
    results.append(dict(zip(
        ('Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Confusion Matrix'),
        (name, accuracy, precision, recall, f1, cm),
    )))

Metrics for Naive Bayes:
Accuracy: 0.759125
Precision: 0.7636316872427984
Recall: 0.7465426200653759
F1 Score: 0.7549904640813732
Metrics for Logistic Regression:
Accuracy: 0.774
Precision: 0.7656135194709772
Recall: 0.7860196127734473
F1 Score: 0.775682382133995
Metrics for SVM:
Accuracy: 0.77025
Precision: 0.7624539877300613
Recall: 0.7812421423183304
F1 Score: 0.7717337307501242
Metrics for Random Forest:
Accuracy: 0.743875
Precision: 0.7480699948533196
Recall: 0.7309529796328891
F1 Score: 0.7394124380007631
Metrics for Decision Tree:
Accuracy: 0.6775
Precision: 0.672511731291677
Recall: 0.6846869499622831
F1 Score: 0.6785447296287067
Metrics for K-Nearest Neighbors:
Accuracy: 0.561875
Precision: 0.5420977524081342
Recall: 0.7641438270052804
F1 Score: 0.6342481477616613


In [None]:
 # Tabulate the collected per-classifier records: one row per record, one column per key
results_df = pd.DataFrame(data=results)

In [None]:
# Display the metrics table built from `results` (one row per appended record)
results_df

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1 Score,Confusion Matrix
0,K-Nearest Neighbors,0.561875,0.542098,0.764144,0.634248,"[[1456, 2567], [938, 3039]]"
