<a href="https://colab.research.google.com/github/Abidjahan1/sentiment_anlaysis/blob/main/logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install kagglehub[pandas-datasets]

from google.colab import files

# files.upload() opens Colab's file picker and returns the uploaded files'
# contents keyed by filename. Binding the result (instead of leaving the call
# as the cell's last expression) keeps those contents out of the saved cell
# output -- presumably this is kaggle.json, per the commented-out Kaggle API
# setup in the next cell; confirm.
uploaded = files.upload()



In [7]:
#@title dataset uploading with kaggle API
# !mkdir -p ~/.kaggle
# !mv kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

In [8]:
import kagglehub
import pandas as pd
import re


# Fetch the Sentiment140 dataset via kagglehub; the call hands back the
# location the files were downloaded to.
path = kagglehub.dataset_download("kazanova/sentiment140")
print("Dataset files downloaded to:", path)

# The training CSV sits directly inside that location
csv_name = "training.1600000.processed.noemoticon.csv"
csv_path = "{}/{}".format(path, csv_name)



Dataset files downloaded to: /kaggle/input/sentiment140


In [None]:
#@title load csv file with custom names

# The file is read with header=None, so the column names are supplied here
columns = [
    'target',
    'ids',
    'date',
    'flag',
    'user_name',
    'text',
]

# Read the CSV as latin-1 and label the columns with the names above
df = pd.read_csv(
    csv_path,
    names=columns,
    header=None,
    encoding="latin-1",
)

# Preview the first 50 rows
df.head(50)

In [None]:
 #@title Filter and Clean Data

 # Keep only positive (4) and negative (0) sentiments
# Select the labelled rows and the two columns the model needs in one step.
# - .copy() detaches the result from the frame it was sliced from, so the
#   assignment below does not raise pandas' SettingWithCopyWarning.
# - The already-mapped label 1 is accepted alongside 0 and 4 so that
#   re-running this cell does not silently discard every positive tweet.
df = df.loc[df['target'].isin([0, 1, 4]), ['text', 'target']].copy()

# Map sentiment values: 0 stays 0 (negative), 4 becomes 1 (positive).
# replace() leaves values that are not keys (0 and 1) untouched, which keeps
# the cell idempotent.
df['target'] = df['target'].replace({4: 1})

# Check distribution
df['target'].value_counts()


In [None]:
#@title Preprocess Text

def clean_text(text):
    """Normalise one tweet to lowercase letters separated by single spaces.

    Steps, in order: lowercase, drop URLs (anything starting with ``http``),
    drop @mentions, drop #hashtags, drop every character that is not a-z or
    whitespace, then collapse whitespace runs.

    Args:
        text: The raw tweet. Non-string values (e.g. ``None`` or the float
            NaN pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned string; ``""`` when nothing is left or the input was not
        a string.
    """
    # Missing cells arrive as None/NaN, which have no .lower(); treat as empty
    if not isinstance(text, str):
        return ""

    text = text.lower()                          # Lowercase all text
    text = re.sub(r'http\S+', '', text)          # Remove URLs
    text = re.sub(r'@\w+', '', text)             # Remove mentions
    text = re.sub(r'#\w+', '', text)             # Remove hashtags
    text = re.sub(r'[^a-z\s]', '', text)         # Remove punctuation/numbers

    # Removing tokens leaves runs of spaces behind; split/join collapses them
    # and trims both ends in one pass.
    return " ".join(text.split())

# Run every tweet through clean_text, then write the result back to the column
cleaned_tweets = df['text'].apply(clean_text)
df['text'] = cleaned_tweets

# Quick look at the first few cleaned rows
df.head()


In [None]:
#@title Better Text Preprocessing

import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stopword corpus is available locally
nltk.download('stopwords')

# Hold the English stopwords in a set so membership checks are cheap
stop_words = {word for word in stopwords.words('english')}

def clean_text(text):
    """Clean one tweet and drop its stopwords.

    Lowercases the text, removes URLs, @mentions, #hashtags and every
    character that is not a-z or whitespace, then filters out any word found
    in the module-level ``stop_words`` set.

    NOTE(review): NLTK's English list appears to include negations such as
    "not" and "no"; confirm that dropping them is intended for sentiment.

    Args:
        text: The tweet as a string.

    Returns:
        The remaining words joined by single spaces.
    """
    lowered = text.lower()

    # Same removal patterns, same order: URLs, mentions, hashtags, non-letters
    for pattern in (r'http\S+', r'@\w+', r'#\w+', r'[^a-z\s]'):
        lowered = re.sub(pattern, '', lowered)

    # Keep only the tokens that are not stopwords
    kept = filter(lambda token: token not in stop_words, lowered.split())
    return " ".join(kept)

# Run the stopword-aware clean_text over every tweet and store the result
text_without_stopwords = df['text'].apply(clean_text)
df['text'] = text_without_stopwords

# Show the resulting column
df['text']

In [None]:
#@title  Split into Train and Test Sets
from sklearn.model_selection import train_test_split

# Features are the tweet text; labels are the sentiment target
X = df['text']
y = df['target']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: every row landed in exactly one of the two partitions
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

# Summarise the split sizes instead of echoing the four full Series
{'train': X_train.shape, 'test': X_test.shape}


In [None]:
#@title Text Vectorization with TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Create the vectorizer. No ngram_range is passed, so only single words
# (unigrams) become features.
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vocabulary/IDF weights on the training tweets only, then apply the
# same fitted transform to both partitions so the test set stays unseen.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Notes
# -----
# TF-IDF converts text into numeric features by combining:
#   - Term Frequency (TF): how often a term occurs in a tweet
#   - Inverse Document Frequency (IDF): down-weights terms that occur in
#     many tweets
#
# max_features=5000 limits the vocabulary to the 5000 terms with the highest
# frequency across the training corpus.
#
# Word pairs (bigrams) such as "not good" are NOT produced by this
# configuration; passing ngram_range=(1, 2) to TfidfVectorizer would add them.

In [32]:
#@title Try Class Balancing
from sklearn.utils import resample

# Separate classes
df_pos = df[df['target'] == 1]
df_neg = df[df['target'] == 0]

# Target size is the smaller class, whichever one that is. The previous
# version assumed positives were the majority; resample(..., replace=False)
# raises when asked for more rows than the frame has, so it would fail if
# negatives outnumbered positives.
n_per_class = min(len(df_pos), len(df_neg))

# Downsample positives to the target size (same call and seed as before)
df_pos_down = resample(df_pos, replace=False, n_samples=n_per_class, random_state=42)

# Negatives are only touched when they are the larger class
if len(df_neg) > n_per_class:
    df_neg_down = resample(df_neg, replace=False, n_samples=n_per_class, random_state=42)
else:
    df_neg_down = df_neg

# Combine again
# NOTE(review): none of the other cells shown read df_balanced -- the split
# and the model use df directly -- so this balancing currently has no effect
# on training; confirm whether it should feed the train/test split.
df_balanced = pd.concat([df_neg_down, df_pos_down])


In [None]:
#@title Train Logistic Regression Model

from sklearn.linear_model import LogisticRegression

# Build a LogisticRegression with its default settings and fit it on the
# TF-IDF training matrix in one expression; fit() returns the estimator
# itself, so `model` is the trained classifier.
model = LogisticRegression().fit(X_train_vec, y_train)

'''Logistic Regression is a linear model for binary classification.

fit() tells the model to learn the relationship between tweet vectors and sentiment labels.'''

In [None]:
#@title  Evaluate the Model

from sklearn.metrics import accuracy_score, classification_report

# Score the held-out tweets with the fitted classifier
y_pred = model.predict(X_test_vec)

# Compute both summaries first, then print them
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

"""
Accuracy: 0.77364375

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.75      0.77    159494
           1       0.76      0.80      0.78    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000

"""


In [None]:
#@title Save Model and Vectorizer for Django
import pickle

# Pickle the trained model first, then the fitted TF-IDF vectorizer, each to
# its own file in the working directory.
for filename, artifact in (('model.pkl', model), ('tfidf.pkl', vectorizer)):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)


'''
pickle saves Python objects to disk so they can be reused later (in Django).

Load model.pkl and tfidf.pkl inside Django to use the trained model and its vectorizer.
'''