<a href="https://colab.research.google.com/github/EmilKJohn99/hello/blob/main/superfast_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re

In [2]:
# Install the Kaggle API token for the CLI: create ~/.kaggle if it is missing,
# copy kaggle.json from the current working directory into it, and restrict
# the copied file to owner read/write only (mode 600).
# NOTE(review): assumes kaggle.json has already been placed in the working
# directory before this cell runs — confirm.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# Download the kazanova/sentiment140 dataset archive with the Kaggle CLI.
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
  0% 0.00/80.9M [00:00<?, ?B/s]
100% 80.9M/80.9M [00:00<00:00, 1.14GB/s]


In [4]:
from zipfile import ZipFile

# Archive downloaded by the Kaggle CLI cell.
dataset = '/content/sentiment140.zip'

# Unpack every member into the current working directory. The handle is named
# `archive` rather than `zip`: the `as` target stays bound after the block, so
# calling it `zip` would shadow the built-in zip() for the rest of the session.
with ZipFile(dataset, 'r') as archive:
    archive.extractall()

print("dataset has been extracted")

dataset has been extracted


In [5]:
# Column labels are supplied explicitly through `names=`, so pandas reads the
# first line of the file as data rather than as a header.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']

twitter_data = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    names=column_names,
    encoding='ISO-8859-1',
)


In [6]:
# Relabel target value 4 as 1; all other values are left as they are.
# (Label 1 is what the final cell of this notebook reports as "Positive".)
twitter_data['target'] = twitter_data['target'].replace({4: 1})


In [7]:
def clean_text(text):
    """Return `text` with every non-ASCII-letter replaced by a space, lowercased.

    Digits, punctuation, whitespace and non-ASCII characters (e.g. emoji) each
    become a single space; runs of spaces are not collapsed.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    return letters_only.lower()

In [8]:
# Run clean_text over every raw tweet and store the result in a new column,
# leaving the original 'text' column untouched.
twitter_data['clean_text'] = twitter_data['text'].map(clean_text)

In [9]:
# Pull the model inputs (cleaned tweets) and labels out as NumPy arrays.
X = twitter_data['clean_text'].to_numpy()
Y = twitter_data['target'].to_numpy()


In [10]:
# Hold out 20% of the rows for testing. `stratify=Y` keeps the label
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)


In [11]:
# TF-IDF features: English stop words removed, vocabulary capped at
# 10,000 terms to limit features for speed.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
)

# Fit the vocabulary on the training split only, then reuse that fitted
# vocabulary to transform the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [12]:
# Logistic-regression classifier with the solver allowed up to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_vec, y=Y_train)


In [13]:
# Predicted labels for both splits (the names say "X" but these hold label predictions).
X_train_pred, X_test_pred = (
    model.predict(features) for features in (X_train_vec, X_test_vec)
)

# Fraction of correctly classified rows in each split.
train_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_pred)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

Training Accuracy: 0.7765
Test Accuracy: 0.7733


In [14]:
import pickle

In [15]:
filename = "trained_model.sav"

# Serialise the fitted classifier to disk. The `with` block closes the file
# handle (flushing the write) even if pickling raises; the previous bare
# open() left the handle for the garbage collector to close.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# NOTE(review): only the classifier is saved here. Transforming new text also
# needs the fitted `vectorizer`, which this cell does not persist.

In [16]:
# Reload the classifier from the file written by the save cell. Reusing
# `filename` keeps the load path tied to the dump path instead of assuming the
# working directory is /content, and `with` closes the handle after reading.
# Only unpickle files you created yourself: pickle.load can execute arbitrary
# code from an untrusted file.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [21]:
test_input = ["I absolutely love this product! It works like a charm 😍"]

# Preprocess exactly as for training: reuse the notebook's existing
# clean_text() (rather than redefining it here) and the already-fitted
# vectorizer, so inference cannot drift from the training pipeline.
cleaned_input = [clean_text(text) for text in test_input]
input_vec = vectorizer.transform(cleaned_input)


In [22]:
# Predict with the classifier restored from disk so this cell actually
# exercises the save/load round-trip; `loaded_model` was otherwise never used.
prediction = loaded_model.predict(input_vec)

In [23]:
# Label 1 is reported as positive; any other predicted label as negative.
if prediction[0] == 1:
    print("Sentiment:", "Positive")
else:
    print("Sentiment:", "Negative")

Sentiment: Positive
