In [3]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [5]:
# Read the raw training and validation CSVs. header=None makes pandas treat
# the first row as data instead of using it for column names.
# NOTE(review): the following cell reads 'twitter_training.csv' and
# 'twitter_validation.csv' again (relative path, explicit column names) and
# rebinds both variables -- confirm whether this absolute-path load is needed.
train_data = pd.read_csv(
    '/content/twitter_training.csv',
    header=None,
)
validation_data = pd.read_csv(
    '/content/twitter_validation.csv',
    header=None,
)

In [6]:
# The files are read with header=None, so the column labels are supplied here.
# NOTE(review): 'Borderlands' looks like a data value used as a column label
# (presumably the topic/entity column) -- confirm before relying on it.
column_names = ['id', 'Borderlands', 'label', 'tweet']
train_data = pd.read_csv(
    'twitter_training.csv',
    names=column_names,
    header=None,
)
validation_data = pd.read_csv(
    'twitter_validation.csv',
    names=column_names,
    header=None,
)

In [7]:
# Fill missing tweets with '' BEFORE casting to str. Casting first turns NaN
# into the literal text 'nan', which leaves fillna('') with nothing to fill and
# makes a missing tweet indistinguishable from a real tweet reading "nan"
# (so an empty-string check downstream would never catch it).
train_data['tweet'] = train_data['tweet'].fillna('').astype(str)
validation_data['tweet'] = validation_data['tweet'].fillna('').astype(str)

In [8]:
# Function to clean text
def clean_text(text):
    """Strip tweet noise from ``text`` and return it lower-cased.

    Removed, in this order: URLs (``http`` followed by non-whitespace),
    ``@mentions``, ``#hashtags`` (the whole tag, not just the ``#``), and any
    remaining character that is neither a word character nor whitespace.
    Whitespace is left as-is, so removed tokens can leave runs of spaces.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    # Order matters: the punctuation pass would otherwise strip '@', '#' and
    # the ':' / '/' / '.' in URLs before the earlier patterns could match.
    removal_patterns = (
        r'http\S+',
        r'@\w+',
        r'#\w+',
        r'[^\w\s]',
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text.lower()

# Run clean_text over every tweet and keep the result in a new column, leaving
# the original 'tweet' column untouched.
train_data['cleaned_text'] = train_data['tweet'].map(clean_text)
validation_data['cleaned_text'] = validation_data['tweet'].map(clean_text)

In [9]:
# Numeric encoding of the text labels. Any label that is not a key of this
# dict is mapped to NaN by Series.map.
sentiment_codes = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
train_data['sentiment'] = train_data['label'].map(sentiment_codes)
validation_data['sentiment'] = validation_data['label'].map(sentiment_codes)

In [10]:
# Drop every row that has a missing value in any column. Rebinding to the
# returned frame (instead of inplace=True) removes the same rows without
# mutating the object that other references may still point at.
train_data = train_data.dropna()
validation_data = validation_data.dropna()

In [11]:
# Keep only rows whose cleaned text still has content once surrounding
# whitespace is ignored.
train_data = train_data.loc[train_data['cleaned_text'].str.strip().ne('')]
validation_data = validation_data.loc[validation_data['cleaned_text'].str.strip().ne('')]

In [12]:
# Features are the cleaned tweet text; targets are the numeric sentiment codes.
# The validation frame plays the role of the held-out test set.
X_train, y_train = train_data['cleaned_text'], train_data['sentiment']
X_test, y_test = validation_data['cleaned_text'], validation_data['sentiment']

In [13]:
# TF-IDF features, capped at 5000 terms, with English stop words removed.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
# The vocabulary and IDF weights are learned from the training text only; the
# test text is then projected onto that same fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [15]:
# Fit an L2-penalised logistic regression on the training TF-IDF matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(
    penalty='l2',
    C=0.1,
    max_iter=1000,
).fit(X_train_tfidf, y_train)

# Score the held-out set.
y_pred = model.predict(X_test_tfidf)

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)


Classification Report:
              precision    recall  f1-score   support

        -1.0       0.69      0.86      0.77       266
         0.0       0.85      0.61      0.71       285
         1.0       0.80      0.84      0.82       277

    accuracy                           0.77       828
   macro avg       0.78      0.77      0.76       828
weighted avg       0.78      0.77      0.76       828

[[228  19  19]
 [ 70 175  40]
 [ 32  13 232]]
