In [39]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,f1_score
from sklearn.pipeline import Pipeline


## 🔧 1. Preprocessing and Setup

In [40]:
# Read the training and test CSV files (both are expected at the filesystem root).
train =pd.read_csv('/train.csv')
test = pd.read_csv('/test.csv')

In [41]:
# Remove the 'id' identifier column from both frames; modelling only uses the text.
# Rebinding the result (rather than inplace=True) combined with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises
# KeyError because 'id' has already been removed.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

In [42]:
# Display the first rows of the training frame as a quick sanity check.
train.head()

Unnamed: 0,text,sentiment
0,Extremely easy user interface. Helpsnin using ...,positive
1,"Hey Luis, thanks! I have Flash and my prof`s...",positive
2,For sure you should continue to Tweet WHILE o...,negative
3,on the phone w. Chantellie ! <3,neutral
4,Clunky and very slow to load. Not fun at all -...,negative


In [43]:


import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download NLTK's stop-word corpus, then build the English stop-word set and
# the Porter stemmer that preprocess() relies on.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    """Normalise one raw text value into a space-joined string of stemmed tokens.

    Steps: lowercase, strip URLs, replace every non-word character with a
    space, drop English stop words, and Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str or missing value
        Raw text. ``None`` / ``NaN`` / ``pd.NA`` are accepted and treated as
        empty text; any other non-string scalar is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, or ``''`` when the input is missing or contains no
        usable tokens.
    """
    if not isinstance(text, str):
        # pandas represents empty CSV cells as float NaN, which has no
        # .lower(); map missing values to '' instead of raising AttributeError.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()  # lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # remove links
    text = re.sub(r'\W', ' ', text)  # remove special characters
    # str.split() with no argument already splits on runs of whitespace, so no
    # separate whitespace-collapsing pass is needed before tokenising.
    tokens = text.split()
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

# Add a 'cleaned_text' column to both frames by running preprocess() on every
# value of the 'text' column.
train['cleaned_text'] = train['text'].apply(preprocess)
test['cleaned_text'] = test['text'].apply(preprocess)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
# Build the null mask for the two modelling columns once, then report the
# per-column counts and the number of rows that have at least one missing value.
_na_mask = train[['text', 'sentiment']].isna()
print("NaN in text column:", _na_mask['text'].sum())
print("NaN in sentiment column:", _na_mask['sentiment'].sum())
print("Total rows with NaNs:", _na_mask.any(axis=1).sum())



NaN in text column: 0
NaN in sentiment column: 1
Total rows with NaNs: 1


In [45]:
# Keep only the rows in which both 'text' and 'sentiment' are present.
train = train.dropna(subset=['text', 'sentiment'])


## 📊 2. Vectorization

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Stand-alone TF-IDF matrix over the cleaned training text (unigrams and bigrams).
# NOTE(review): `X` is not read by any later cell visible in this notebook; the
# grid-search pipeline further down builds its own TfidfVectorizer.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_df=0.9,
    min_df=5,
    max_features=10000,
)

X = vectorizer.fit_transform(train['cleaned_text'])


In [47]:
from sklearn.preprocessing import LabelEncoder

# Encode the sentiment strings as integer class codes; `le` is kept so the
# codes can be mapped back to strings later via le.inverse_transform().
le = LabelEncoder()
y = le.fit_transform(train['sentiment'])  # e.g., positive, negative


## 🧠 3. Train-Test Split

In [48]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned training text for validation. The split is
# stratified on the encoded labels and uses a fixed seed so it is repeatable.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    train['cleaned_text'],
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [49]:
# Confirm the training split is still a container of strings, which is what
# the text pipeline expects as input.
print(type(X_train_text))        # should be something like <class 'pandas.core.series.Series'> or list of strings


<class 'pandas.core.series.Series'>


## 🚀 4. Train a Logistic Regression Classifier

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Text-classification pipeline: TF-IDF features feeding a logistic regression.
# random_state is pinned (same seed as the train/validation split) because the
# 'liblinear' and 'saga' solvers searched below shuffle the data internally;
# without a seed the cross-validation scores, and therefore the selected
# "best" parameters, can differ from one run to the next.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

# 2 * 3 * 2 * 2 * 3 * 2 * 2 = 288 candidate settings.
param_grid = {
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__max_df': [0.75, 0.85, 0.95],
    'tfidf__min_df': [3, 5],
    'tfidf__max_features': [5000, 10000],
    'clf__C': [0.1, 1, 10],
    'clf__class_weight': [None, 'balanced'],
    'clf__solver': ['liblinear', 'saga']
}

# Exhaustive search scored by weighted F1 with 5-fold cross-validation,
# fitted on every labelled row of the cleaned training text.
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=2)
grid.fit(train['cleaned_text'], y)  # y holds the LabelEncoder-encoded sentiment

print("Best parameters:", grid.best_params_)


Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Best parameters: {'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__solver': 'liblinear', 'tfidf__max_df': 0.75, 'tfidf__max_features': 5000, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2)}


## 📈 5. Evaluation

In [51]:
from sklearn.base import clone
from sklearn.metrics import classification_report


# Run the hold-out evaluation on an unfitted copy of the search rather than
# on `grid` itself. Calling grid.fit(X_train_text, y_train) here would replace
# the search already fitted on the full training set with one trained on only
# the 80% training split, and every later grid.predict(...) call would then
# silently use that smaller model.
eval_grid = clone(grid)
eval_grid.fit(X_train_text, y_train)
y_pred = eval_grid.predict(X_val_text)  # predict on the held-out cleaned text

# Per-class precision/recall/F1 on the validation split, then the single
# weighted-F1 figure (the same metric the grid search optimises).
print(classification_report(y_val, y_pred, target_names=le.classes_))
print("Weighted F1 Score:", f1_score(y_val, y_pred, average='weighted'))


Fitting 5 folds for each of 288 candidates, totalling 1440 fits
              precision    recall  f1-score   support

    negative       0.61      0.56      0.58       445
     neutral       0.56      0.64      0.60       579
    positive       0.72      0.66      0.69       503

    accuracy                           0.62      1527
   macro avg       0.63      0.62      0.62      1527
weighted avg       0.63      0.62      0.62      1527

Weighted F1 Score: 0.6238569273180034


In [53]:
# Label the test set from its cleaned text using the fitted grid search.
y_pred_test = grid.predict(test['cleaned_text'])

# Map the integer class codes back to their sentiment strings.
submission_labels = le.inverse_transform(y_pred_test)

# The 'id' column was dropped from `test` earlier, so read it again from the
# source file.
test_ids = pd.read_csv('/test.csv')['id']

# Assemble the two-column submission table and write it without the index.
submission = pd.DataFrame({'id': test_ids, 'Sentiment': submission_labels})
submission.to_csv('analyse_submission.csv', index=False)
print("Submission file created: analyse_submission.csv")


Submission file created: analyse_submission.csv
