<a href="https://colab.research.google.com/github/ArkodevMukherjee/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


In [2]:
import pandas as pd

df = pd.read_csv("..//kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download("stopwords")

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r"<.*?>", "", text)  # remove HTML
    text = re.sub(r"[^a-zA-Z]", " ", text)  # remove non-letters
    text = text.lower()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
nltk.download("wordnet")
df["review"] = df["review"].apply(clean_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [6]:
df

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode hoo...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake think zombie ...,negative
4,petter mattei love time money visually stunnin...,positive
...,...,...
49995,thought movie right good job creative original...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary school nu...,negative
49998,going disagree previous comment side maltin on...,negative


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=5000,       # only top 5000 words
    min_df=5,                # must appear in at least 5 documents
    max_df=0.8,              # ignore too-common words
    ngram_range=(1,1),       # only unigrams (words)
    stop_words='english'     # remove common stop words
)
X = vectorizer.fit_transform(df["review"])

In [9]:
Y = df["sentiment"]

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [11]:
vectorizer.get_feature_names_out()
print(len(vectorizer.get_feature_names_out()
# ['acting', 'and', 'boring', 'direction', 'fantastic', 'good', 'love', 'movie', 'not', 'terrible', 'this', 'very', 'was']
))
# ['acting', 'and', 'boring', 'direction', 'fantastic', 'good', 'love', 'movie', 'not', 'terrible', 'this', 'very', 'was']

5000


In [12]:
import pandas as pd

df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print(df_tfidf.head())

   aaron  abandoned  abc  ability  able  abraham  abrupt  absence  absent  \
0    0.0        0.0  0.0      0.0   0.0      0.0     0.0      0.0     0.0   
1    0.0        0.0  0.0      0.0   0.0      0.0     0.0      0.0     0.0   
2    0.0        0.0  0.0      0.0   0.0      0.0     0.0      0.0     0.0   
3    0.0        0.0  0.0      0.0   0.0      0.0     0.0      0.0     0.0   
4    0.0        0.0  0.0      0.0   0.0      0.0     0.0      0.0     0.0   

   absolute  ...  yes  yesterday      york     young  younger  youngster  \
0       0.0  ...  0.0        0.0  0.000000  0.000000      0.0        0.0   
1       0.0  ...  0.0        0.0  0.000000  0.000000      0.0        0.0   
2       0.0  ...  0.0        0.0  0.000000  0.079911      0.0        0.0   
3       0.0  ...  0.0        0.0  0.000000  0.000000      0.0        0.0   
4       0.0  ...  0.0        0.0  0.104782  0.000000      0.0        0.0   

   youth  zero    zombie  zone  
0    0.0   0.0  0.000000   0.0  
1    0.0   0.0

In [13]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train,Y_train)

In [14]:
from sklearn.metrics import accuracy_score

# Evaluate on training data
Y_train_pred = model.predict(X_train)
train_acc = accuracy_score(Y_train, Y_train_pred)

# Evaluate on test data
Y_test_pred = model.predict(X_test)
test_acc = accuracy_score(Y_test, Y_test_pred)

print("Train Accuracy:", train_acc)
print("Test Accuracy:", test_acc)

Train Accuracy: 0.90825
Test Accuracy: 0.885


In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

# Train metrics
train_accuracy = accuracy_score(Y_train, Y_train_pred)
train_precision = precision_score(Y_train, Y_train_pred, pos_label="positive")
train_recall = recall_score(Y_train, Y_train_pred, pos_label="positive")
train_f1 = f1_score(Y_train, Y_train_pred, pos_label="positive")

# Test metrics
test_accuracy = accuracy_score(Y_test, Y_test_pred)
test_precision = precision_score(Y_test, Y_test_pred, pos_label="positive")
test_recall = recall_score(Y_test, Y_test_pred, pos_label="positive")
test_f1 = f1_score(Y_test, Y_test_pred, pos_label="positive")

# Print results
print("Train Metrics")
print(f"Accuracy : {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall   : {train_recall:.4f}")
print(f"F1 Score : {train_f1:.4f}")

print("\nTest Metrics")
print(f"Accuracy : {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall   : {test_recall:.4f}")
print(f"F1 Score : {test_f1:.4f}")

Train Metrics
Accuracy : 0.9083
Precision: 0.8994
Recall   : 0.9193
F1 Score : 0.9093

Test Metrics
Accuracy : 0.8850
Precision: 0.8752
Recall   : 0.8980
F1 Score : 0.8865


In [16]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('model', LogisticRegression())
])

pipeline.fit(df["review"],df["sentiment"])

In [17]:
clean_text("This is a good movie")
pipeline.predict(["What"])

array(['negative'], dtype=object)

In [18]:
import joblib


joblib.dump(pipeline, "sentiment-1.pkl")

['sentiment-1.pkl']

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, Y_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict
Y_train_pred = rf.predict(X_train)
Y_test_pred = rf.predict(X_test)

# Train metrics
train_accuracy = accuracy_score(Y_train, Y_train_pred)
train_precision = precision_score(Y_train, Y_train_pred, pos_label="positive")
train_recall = recall_score(Y_train, Y_train_pred, pos_label="positive")
train_f1 = f1_score(Y_train, Y_train_pred, pos_label="positive")

# Test metrics
test_accuracy = accuracy_score(Y_test, Y_test_pred)
test_precision = precision_score(Y_test, Y_test_pred, pos_label="positive")
test_recall = recall_score(Y_test, Y_test_pred, pos_label="positive")
test_f1 = f1_score(Y_test, Y_test_pred, pos_label="positive")

# Print results
print("Train Metrics")
print(f"Accuracy : {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall   : {train_recall:.4f}")
print(f"F1 Score : {train_f1:.4f}")

print("\nTest Metrics")
print(f"Accuracy : {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall   : {test_recall:.4f}")
print(f"F1 Score : {test_f1:.4f}")

In [None]:
    from sklearn.ensemble import RandomForestClassifier # or RandomForestRegressor
    from sklearn.model_selection import GridSearchCV

In [None]:
  param_grid = {
      'n_estimators': [100, 200, 300],
      'max_depth': [None, 10, 20, 30],
      'min_samples_split': [2, 5, 10],
      'min_samples_leaf': [1, 2, 4],
      'bootstrap': [True, False]
  }

In [None]:
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(), # or RandomForestRegressor()
    param_grid=param_grid,
    cv=5, # Number of cross-validation folds
    scoring='accuracy', # or other relevant metric like 'f1', 'neg_mean_squared_error'
    n_jobs=-1 # Use all available CPU cores for parallel processing
)

In [None]:
grid_search.fit(X_train, Y_train)

In [None]:
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    best_estimator = grid_search.best_estimator_