In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('archive-2/twitter_training.csv', header=None)
# Let's provide some generic column names based on the context.
# From the tutorial descriptions, the columns are likely: Tweet ID, Entity, Sentiment, and Tweet Content.
df.columns = ['Tweet ID', 'Entity', 'Sentiment', 'Tweet Content']

# Display the first few rows of the dataframe
print("First 5 rows of the dataset:")
print(df.head())

First 5 rows of the dataset:
   Tweet ID       Entity Sentiment  \
0      2401  Borderlands  Positive   
1      2401  Borderlands  Positive   
2      2401  Borderlands  Positive   
3      2401  Borderlands  Positive   
4      2401  Borderlands  Positive   

                                       Tweet Content  
0  I am coming to the borders and I will kill you...  
1  im getting on borderlands and i will kill you ...  
2  im coming on borderlands and i will murder you...  
3  im getting on borderlands 2 and i will murder ...  
4  im getting into borderlands and i can murder y...  


In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download necessary NLTK data (if you haven't already)
nltk.download('punkt')
nltk.download('stopwords')

# --- Preprocessing Steps ---

# 1. Handle Missing Values
df['Tweet Content'].fillna('', inplace=True)

# 2. Lowercase Conversion
df['cleaned_text'] = df['Tweet Content'].str.lower()

# 3. Punctuation Removal
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# 4. Tokenization
df['tokenized_text'] = df['cleaned_text'].apply(word_tokenize)

# 5. Stop Word Removal
stop_words = set(stopwords.words('english'))
df['tokenized_text'] = df['tokenized_text'].apply(lambda x: [word for word in x if word not in stop_words])

# 6. Stemming
stemmer = PorterStemmer()
df['stemmed_text'] = df['tokenized_text'].apply(lambda x: [stemmer.stem(word) for word in x])

# Join the stemmed words back into a single string
df['processed_text'] = df['stemmed_text'].apply(lambda x: ' '.join(x))

# Display the DataFrame with the new processed text column
print(df[['Tweet Content', 'processed_text']].head())

[nltk_data] Downloading package punkt to /Users/muhdajwd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/muhdajwd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Tweet Content'].fillna('', inplace=True)


                                       Tweet Content  \
0  I am coming to the borders and I will kill you...   
1  im getting on borderlands and i will kill you ...   
2  im coming on borderlands and i will murder you...   
3  im getting on borderlands 2 and i will murder ...   
4  im getting into borderlands and i can murder y...   

               processed_text  
0            come border kill  
1      im get borderland kill  
2   im come borderland murder  
3  im get borderland 2 murder  
4    im get borderland murder  


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Define features (X) and target (y)
X = df['processed_text']
y = df['Sentiment']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize a TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000) # You can adjust max_features

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# --- Train Random Forest Classifier ---
print("Training Random Forest Classifier...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)  # n_jobs=-1 uses all available CPU cores
rf_model.fit(X_train_tfidf, y_train)
print("Random Forest training completed.")

# --- Train Naive Bayes Classifier ---
print("\nTraining Naive Bayes Classifier...")
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
print("Naive Bayes training completed.")

Training Random Forest Classifier...
Random Forest training completed.

Training Naive Bayes Classifier...
Naive Bayes training completed.


In [5]:
from sklearn.metrics import accuracy_score, classification_report

# --- Evaluate Random Forest Classifier ---
print("\n--- Random Forest Evaluation ---")
y_pred_rf = rf_model.predict(X_test_tfidf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {accuracy_rf:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

# --- Evaluate Naive Bayes Classifier ---
print("\n--- Naive Bayes Evaluation ---")
y_pred_nb = nb_model.predict(X_test_tfidf)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy: {accuracy_nb:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_nb))


--- Random Forest Evaluation ---
Accuracy: 0.8800

Classification Report:
              precision    recall  f1-score   support

  Irrelevant       0.95      0.80      0.87      2598
    Negative       0.91      0.90      0.90      4509
     Neutral       0.81      0.90      0.85      3664
    Positive       0.88      0.89      0.89      4166

    accuracy                           0.88     14937
   macro avg       0.89      0.87      0.88     14937
weighted avg       0.88      0.88      0.88     14937


--- Naive Bayes Evaluation ---
Accuracy: 0.6339

Classification Report:
              precision    recall  f1-score   support

  Irrelevant       0.74      0.36      0.48      2598
    Negative       0.61      0.80      0.70      4509
     Neutral       0.66      0.51      0.58      3664
    Positive       0.62      0.73      0.67      4166

    accuracy                           0.63     14937
   macro avg       0.66      0.60      0.61     14937
weighted avg       0.65      0.63    

In [6]:
def predict_sentiment(text, model, vectorizer, stemmer_obj):
    # Apply the same preprocessing steps
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    
    stop_words_set = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words_set]
    tokens = [stemmer_obj.stem(word) for word in tokens]
    processed_text = ' '.join(tokens)
    
    # Vectorize the text using the fitted vectorizer
    vectorized_text = vectorizer.transform([processed_text])
    
    # Predict the sentiment using the provided model
    prediction = model.predict(vectorized_text)
    
    return prediction[0]

# --- Example Usage ---
new_tweet_positive = "I am absolutely loving this new update, the developers did a fantastic job!"
new_tweet_negative = "This is the worst game I have ever played, it's full of bugs and crashes constantly."

print("\n--- Predictions on New Tweets ---")

# Get predictions from both models for the positive tweet
print(f"Tweet: '{new_tweet_positive}'")
print(f"  - Random Forest Prediction: {predict_sentiment(new_tweet_positive, rf_model, tfidf_vectorizer, stemmer)}")
print(f"  - Naive Bayes Prediction:     {predict_sentiment(new_tweet_positive, nb_model, tfidf_vectorizer, stemmer)}")

# Get predictions from both models for the negative tweet
print(f"\nTweet: '{new_tweet_negative}'")
print(f"  - Random Forest Prediction: {predict_sentiment(new_tweet_negative, rf_model, tfidf_vectorizer, stemmer)}")
print(f"  - Naive Bayes Prediction:     {predict_sentiment(new_tweet_negative, nb_model, tfidf_vectorizer, stemmer)}")


--- Predictions on New Tweets ---
Tweet: 'I am absolutely loving this new update, the developers did a fantastic job!'
  - Random Forest Prediction: Positive
  - Naive Bayes Prediction:     Positive

Tweet: 'This is the worst game I have ever played, it's full of bugs and crashes constantly.'
  - Random Forest Prediction: Negative
  - Naive Bayes Prediction:     Negative


In [7]:
# --- Step 7: Visualizing Model Performance ---
print("\n--- Step 7: Generating Visualizations ---")

labels = sorted(y.unique())

# Function to plot a confusion matrix
def plot_confusion_matrix(y_true, y_pred, model_name, labels):
    mat = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(mat, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {model_name}')
    filename = f'confusion_matrix_{model_name.lower().replace(" ", "_")}.png'
    plt.savefig(filename)
    plt.close()
    print(f"Saved {filename}")

# Plot for Random Forest
plot_confusion_matrix(y_test, y_pred_rf, 'Random Forest', labels)

# Plot for Naive Bayes
plot_confusion_matrix(y_test, y_pred_nb, 'Naive Bayes', labels)

# Compare Performance Metrics (F1-Score)
report_rf = classification_report(y_test, y_pred_rf, output_dict=True)
report_nb = classification_report(y_test, y_pred_nb, output_dict=True)

f1_rf = {label: report_rf[label]['f1-score'] for label in labels if label in report_rf}
f1_nb = {label: report_nb[label]['f1-score'] for label in labels if label in report_nb}

f1_df = pd.DataFrame({'Random Forest': f1_rf, 'Naive Bayes': f1_nb})
f1_df.plot(kind='bar', figsize=(12, 7), rot=0)
plt.title('F1-Score Comparison by Sentiment Class')
plt.ylabel('F1-Score')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('f1_score_comparison.png')
plt.close()
print("Saved f1_score_comparison.png")

# Visualize Sentiment Distribution
actual_counts = y_test.value_counts()
predicted_rf_counts = pd.Series(y_pred_rf).value_counts()
predicted_nb_counts = pd.Series(y_pred_nb).value_counts()

dist_df = pd.DataFrame({
    'Actual': actual_counts,
    'Random Forest': predicted_rf_counts,
    'Naive Bayes': predicted_nb_counts
}).fillna(0).astype(int).loc[labels]

dist_df.plot(kind='bar', figsize=(14, 8), rot=0)
plt.title('Distribution of Sentiments in Test Set (Actual vs. Predicted)')
plt.ylabel('Number of Tweets')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('prediction_distribution_comparison.png')
plt.close()
print("Saved prediction_distribution_comparison.png")


--- Step 7: Generating Visualizations ---
Saved confusion_matrix_random_forest.png
Saved confusion_matrix_naive_bayes.png
Saved f1_score_comparison.png
Saved prediction_distribution_comparison.png
