# Imports

In [13]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from langdetect import detect, DetectorFactory

In [None]:
# One-time setup: uncomment and run these once per environment to fetch the NLTK data used by clean_text
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alex7\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alex7\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alex7\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Load Data

In [3]:
# Read the raw reviews, discard rows missing either the review text or its
# label, and force the review text to plain strings for the text pipeline.
# NOTE(review): the previews below show an 'Unnamed: 0' column, which suggests
# the CSV was saved with its index — confirm whether index_col=0 is wanted.
df = (
    pd.read_csv("./data/reviews.csv")
    .dropna(subset=['Review', 'Label'])
    .assign(Review=lambda frame: frame['Review'].astype(str))
)

# Preprocess Text

In [None]:
# Patterns are compiled once at cell level rather than looked up on every call.
_URL_RE = re.compile(r"http\S+|www\S+|https\S+")
_MENTION_HASHTAG_RE = re.compile(r"@\w+|#\w+")
_NON_ALPHA_RE = re.compile(r"[^a-zA-Z\s]")
_WHITESPACE_RE = re.compile(r'\s+')

# Lazily filled cache for the default NLTK resources, so the stop-word list and
# the lemmatizer are built once instead of once per review.
_CLEAN_TEXT_DEFAULTS = {}


def _default_text_resources():
    """Return the cached (stop_words, lemmatizer) pair, building it on first use."""
    if not _CLEAN_TEXT_DEFAULTS:
        # Build both before storing so a failure cannot leave a half-filled cache.
        built = {
            'stop_words': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
        }
        _CLEAN_TEXT_DEFAULTS.update(built)
    return _CLEAN_TEXT_DEFAULTS['stop_words'], _CLEAN_TEXT_DEFAULTS['lemmatizer']


def clean_text(text, stop_words=None, lemmatizer=None):
    """Normalise one review into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase; strip URLs; strip @mentions and #hashtags;
    replace every non-letter character with a space; collapse whitespace;
    drop stop words; lemmatize what remains.

    Parameters
    ----------
    text : any
        The raw review. It is converted with ``str()`` first, so non-string
        values are accepted.
    stop_words : collection of str, optional
        Tokens to drop. Defaults to NLTK's English stop-word list, loaded once
        and cached.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a cached ``WordNetLemmatizer`` instance.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    # `is None` checks (not truthiness) so an explicitly empty stop-word set is honoured.
    if stop_words is None or lemmatizer is None:
        default_stop_words, default_lemmatizer = _default_text_resources()
        if stop_words is None:
            stop_words = default_stop_words
        if lemmatizer is None:
            lemmatizer = default_lemmatizer

    # clean out tags urls and only alphabet + no multiple spaces
    text = str(text).lower()
    text = _URL_RE.sub('', text)
    text = _MENTION_HASHTAG_RE.sub('', text)
    text = _NON_ALPHA_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # Remove stop words and lemmatize
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return ' '.join(tokens)

In [5]:
# Run every raw review through clean_text and keep the result alongside the
# original text, then preview the first rows.
df['processed'] = df['Review'].map(clean_text)
df.head()

Unnamed: 0,Id,Review,Label,processed
0,0,good and interesting,5,good interesting
1,1,"This class is very helpful to me. Currently, I...",5,class helpful currently still learning class m...
2,2,like!Prof and TAs are helpful and the discussi...,5,like prof ta helpful discussion among student ...
3,3,Easy to follow and includes a lot basic and im...,5,easy follow includes lot basic important techn...
4,4,Really nice teacher!I could got the point eazl...,4,really nice teacher could got point eazliy v


In [None]:
# Pin langdetect's seed before the first detection so results are repeatable across runs.
# NOTE(review): per the langdetect README, detection is non-deterministic without this — confirm.
DetectorFactory.seed = 0

def is_english(text):
    """Return True if detected language is English, False otherwise.

    Detection is best-effort: if ``detect`` raises an ordinary exception, the
    text is reported as not English rather than aborting the whole run.

    Parameters
    ----------
    text : str
        The text to classify.

    Returns
    -------
    bool
        True only when ``detect(text)`` returns ``'en'``.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # langdetect can fail on very short or weird text.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so interrupting the kernel during a long
        # column-wide run actually stops it instead of marking the row False.
        return False

In [7]:
# Flag each raw review with the language-detection result, then preview the first rows.
df['is_english'] = df['Review'].map(is_english)
df.head()

Unnamed: 0,Id,Review,Label,processed,is_english
0,0,good and interesting,5,good interesting,True
1,1,"This class is very helpful to me. Currently, I...",5,class helpful currently still learning class m...,True
2,2,like!Prof and TAs are helpful and the discussi...,5,like prof ta helpful discussion among student ...,True
3,3,Easy to follow and includes a lot basic and im...,5,easy follow includes lot basic important techn...,True
4,4,Really nice teacher!I could got the point eazl...,4,really nice teacher could got point eazliy v,True


In [8]:
# Sanity check: count how many reviews were flagged English vs. non-English.
print(df['is_english'].value_counts())

is_english
True    107018
Name: count, dtype: int64


In [9]:
def add_sentiment_column(df):
    """Add a 'Sentiment' column derived from the numeric 'Label' column.

    Labels 1 and 2 become 'negative', 3 becomes 'neutral', and 4 and 5 become
    'positive'. Any other label value is left as missing (NaN) by the mapping.

    The frame is modified in place and the same object is returned.
    """
    label_to_sentiment = {
        **dict.fromkeys((1, 2), 'negative'),
        3: 'neutral',
        **dict.fromkeys((4, 5), 'positive'),
    }
    sentiments = df['Label'].map(label_to_sentiment)
    df['Sentiment'] = sentiments
    return df


In [10]:
# Attach the coarse 'Sentiment' class derived from the numeric 'Label', then preview the first rows.
df = add_sentiment_column(df)
df.head()

Unnamed: 0,Id,Review,Label,processed,is_english,Sentiment
0,0,good and interesting,5,good interesting,True,positive
1,1,"This class is very helpful to me. Currently, I...",5,class helpful currently still learning class m...,True,positive
2,2,like!Prof and TAs are helpful and the discussi...,5,like prof ta helpful discussion among student ...,True,positive
3,3,Easy to follow and includes a lot basic and im...,5,easy follow includes lot basic important techn...,True,positive
4,4,Really nice teacher!I could got the point eazl...,4,really nice teacher could got point eazliy v,True,positive


In [15]:
# --- Features: TF-IDF over the cleaned review text ---------------------------
# Unigrams and bigrams, capped at 20000 features, with sklearn's English stop list.
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), stop_words='english')
X = tfidf.fit_transform(df['processed'])

# --- Target: integer-encoded sentiment class ---------------------------------
# The fitted encoder is kept in `le` so the codes can be mapped back to names.
le = LabelEncoder()
df['Sentiment_encoded'] = le.fit_transform(df['Sentiment'])
y = df['Sentiment_encoded']