In [69]:
import pandas as pd
import numpy as np

In [70]:
# Import the library to mount Google Drive
from google.colab import drive

# Mount the Google Drive at /content/drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
# Define the file path
file_path = '/content/drive/My Drive/IMDB Dataset.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


**Converting Text to Lower case**

In [72]:
df['review']=df['review'].str.lower()
mapping = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(mapping)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. <br /><br />the...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


**Remove HTML Tags**

In [73]:
import re
def remove_html_tags(text):
  pattern=re.compile("<.*?>")
  return pattern.sub(r"",text)

In [74]:
df['review']=df["review"].apply(remove_html_tags)

**Remove Punctuations**

In [75]:
import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [76]:
df["review"]=df["review"].apply(remove_punctuation)

**Handling Chat words**

In [77]:
# Here Come ChatWords Which i Get from a Github Repository
# Repository Link : https://github.com/rishabhverma17/sms_slang_translator/blob/master/slang.txt
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

In [78]:
# Function
def chat_conversion(text):
    new_text = []
    for i in text.split():
        if i.upper() in chat_words:
            new_text.append(chat_words[i.upper()])
        else:
            new_text.append(i)
    return " ".join(new_text)

In [79]:
df["review"]=df["review"].apply(chat_conversion)

**Removing of Stop Words**

In natural language processing (NLP), stop words are commonly used words that are filtered out during text processing because they don't carry much meaning and can hinder the performance of certain NLP tasks. Examples include articles (like "the," "a," "an"), prepositions (like "in," "on," "at"), and conjunctions (like "and," "but," "or")

In [80]:
!pip install nltk
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [81]:
# We use NLTK library to remove Stopwords.
from nltk.corpus import stopwords
# Here we can see all the stopwords in English.
stopword = stopwords.words('english')
# Function
def remove_stopwords(text):
    new_text = []

    for word in text.split():
        if word in stopword:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [82]:
# We can Apply the same Function on Whole Corpus also
df['review']=df['review'].apply(remove_stopwords)

**Handling Emojies**

In [83]:
# Again Here we use The Regular Expressions to Remove the Emojies from Text or Whole Corpus.
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [84]:
df['review']=df['review'].apply(remove_emoji)

# **Lemmatization and Stemming Difference**
**Stemming** is a simpler, rule-based method that **chops off word endings** to get to the root, which may not always result in a valid word.

**Lemmatization**, on the other hand, is a more sophisticated approach that considers the context and uses a vocabulary (dictionary) to convert words to their base or **dictionary form**, known as the lemma, which is always a valid word.

**Lemmatization**

In [18]:
from nltk.stem import WordNetLemmatizer

# Make sure you have the necessary NLTK resources
nltk.download('wordnet')
nltk.download('omw-1.4')


# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Sample lemmatization function
def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

# Apply lemmatization to your dataset
df['review'] = df['review'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


**Stemming**

In [88]:
from nltk.stem import PorterStemmer

# Initialize stemmer
stemmer = PorterStemmer()

# Sample stemming function
def stem_text(text):
    return " ".join([stemmer.stem(word) for word in text.split()])
# Apply stemming to your dataset
df['review'] = df['review'].apply(stem_text)

In [41]:
df.to_csv("output.csv")

In [42]:
df2=pd.read_csv("/content/output.csv")

In [89]:
from sklearn.model_selection import train_test_split

X=df["review"]
y=df["sentiment"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**TF-IDF**

In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer
vectorizer = TfidfVectorizer()

# Keep the data in sparse format, no need to convert it to an array
X_train = vectorizer.fit_transform(X_train)  # Features
y_train = y_train  # Target

X_test = vectorizer.transform(X_test)
y_test=y_test

In [91]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the SVM model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation results
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8954
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Confusion Matrix:
[[4373  588]
 [ 458 4581]]


**Logistic Regression**

In [92]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create and fit the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.89
Confusion Matrix:
[[4336  625]
 [ 461 4578]]
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      4961
           1       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



Tried on both Lemmatization and stemming they both are giving almost same accuracy.

In [66]:
#from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer with n-gram support (unigrams + bigrams)
#vectorizer = TfidfVectorizer(
    #ngram_range=(1, 2),        # unigram + bigram
    #stop_words='english',      # remove common stopwords
    #max_features=5000          # limit number of features to avoid overfitting
#)

# Fit on training text and transform both training and test sets
#X_train = vectorizer.fit_transform(X_train)
#X_test = vectorizer.transform(X_test)