# Logistic Regression

Import the CSV in a pandas DF

In [32]:
import pandas as pd
imdb_data = pd.read_csv("IMDB Dataset.csv")
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Convert the sentiment row into numeric values

In [33]:
imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})

## Text Preprocessing

- Convert all the text to lowercase
- Remove the HTML embedded tags
- Remove punctuation and special characters
- Tokenize the words
- Remove the stopwords
- Lemmatize

In [47]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize WordNet lemmatizer and stopwords list
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove punctuation, special characters, and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Tokenize
    words = text.split()
    
    # Remove stop words 
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Join words back into a sentence
    text = ' '.join(words)
    
    return text

# Apply preprocessing to the reviews
imdb_data['review'] = imdb_data['review'].apply(preprocess_text)
imdb_data.head()

[nltk_data] Downloading package wordnet to /Users/cgm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/cgm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,sentiment
0,one reviewer ha mentioned watching oz episode ...,1
1,wonderful little production filming technique ...,1
2,thought wa wonderful way spend time hot summer...,1
3,basically family little boy jake think zombie ...,0
4,petter matteis love time money visually stunni...,1


In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Feature Extraction
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(imdb_data['review'])
y = imdb_data['sentiment']

# Logistic Regression
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

# Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.886
Precision: 0.8775905481309316
Recall: 0.8991863464973209
F1 Score: 0.8882572044697118
