In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
import nltk
import pickle

# Load data
twitter_data = pd.read_csv('stemmed_twitterdata.csv')

# Handle missing values by dropping rows with NaN in 'stemmed_content'
twitter_data = twitter_data.dropna(subset=['stemmed_content'])

# Separating data and the labels
X = twitter_data['stemmed_content'].values
y = twitter_data['target'].values

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

# Converting textual data into numerical data
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)



In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Define the models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

# Iterate over each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred, average='binary')  
    train_recall = recall_score(y_train, y_train_pred, average='binary')        
    train_f1 = f1_score(y_train, y_train_pred, average='binary')
    train_roc_auc = roc_auc_score(y_train, y_train_pred)  
    train_conf_matrix = confusion_matrix(y_train, y_train_pred)

    # Test set performance
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, average='binary')    
    test_recall = recall_score(y_test, y_test_pred, average='binary')          
    test_f1 = f1_score(y_test, y_test_pred, average='binary')
    test_roc_auc = roc_auc_score(y_test, y_test_pred)  
    test_conf_matrix = confusion_matrix(y_test, y_test_pred)

    # Print model performance
    print(model_name)

    print('Model Performance for training set:')
    print('- Accuracy Score: {:.4f}'.format(train_accuracy))
    print('- Precision Score: {:.4f}'.format(train_precision))
    print('- Recall Score: {:.4f}'.format(train_recall))
    print('- F1 Score: {:.4f}'.format(train_f1))
    print('- ROC AUC Score: {:.4f}'.format(train_roc_auc))
    print('- Confusion Matrix:\n', train_conf_matrix)

    print('\nModel Performance for testing set:')
    print('- Accuracy Score: {:.4f}'.format(test_accuracy))
    print('- Precision Score: {:.4f}'.format(test_precision))
    print('- Recall Score: {:.4f}'.format(test_recall))
    print('- F1 Score: {:.4f}'.format(test_f1))
    print('- ROC AUC Score: {:.4f}'.format(test_roc_auc))
    print('- Confusion Matrix:\n', test_conf_matrix)
    print('\n\n')


Logistic Regression
Model Performance for training set:
- Accuracy Score: 0.8024
- Precision Score: 0.7891
- Recall Score: 0.8255
- F1 Score: 0.8069
- ROC AUC Score: 0.8024
- Confusion Matrix:
 [[498625 141166]
 [111655 528158]]

Model Performance for testing set:
- Accuracy Score: 0.7783
- Precision Score: 0.7673
- Recall Score: 0.7990
- F1 Score: 0.7828
- ROC AUC Score: 0.7783
- Confusion Matrix:
 [[121180  38768]
 [ 32147 127806]]



