In [11]:
import pandas as pd

In [24]:
reviews_from_movies = pd.read_csv('../data/reviews_from_movies.csv')
reviews_from_users = pd.read_csv('../data/reviews_from_users.csv')
reviews_df = pd.concat([reviews_from_users, reviews_from_movies], ignore_index=True)

In [25]:
reviews_df.describe()

Unnamed: 0,rating,movie_year
count,102.0,102.0
mean,0.836275,2014.980392
std,0.167562,9.718799
min,0.3,1994.0
25%,0.8,2014.0
50%,0.8,2019.0
75%,1.0,2022.0
max,1.0,2023.0


In [26]:
# Remove duplicates
reviews_df.drop_duplicates(inplace=True)

# Drop rows with missing review_text or rating (or fill as appropriate)
reviews_df.dropna(subset=['review_text', 'rating'], inplace=True)

# Limit review text to 500 characters
reviews_df['review_text'] = reviews_df['review_text'].apply(lambda x: x[:500] if isinstance(x, str) else x)

In [27]:
#!pip install langdetect
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Ensures consistent results
DetectorFactory.seed = 0

# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

In [28]:
# Add a language column
reviews_df['language'] = reviews_df['review_text'].apply(detect_language)

In [29]:
# Filter to keep only English reviews
reviews_df = reviews_df[reviews_df['language'] == 'en']

In [30]:
# Drop the language column if no longer needed
reviews_df.drop(columns=['language'], inplace=True)

In [31]:
reviews_df.describe()

Unnamed: 0,rating,movie_year
count,51.0,51.0
mean,0.845098,2017.117647
std,0.174715,8.056419
min,0.3,1994.0
25%,0.8,2016.0
50%,0.9,2019.0
75%,1.0,2022.5
max,1.0,2023.0


In [69]:
# Drop unwanted columns or reorder them
reviews_df = reviews_df[['review_text', 'rating']]
reviews_df.head()

Unnamed: 0,review_text,rating
0,monkey mondays #33,0.8
1,I mean...it's no Pride and Prejudice (2005) bu...,0.6
2,Addressed my inert fear of pink and pretty dre...,0.6
5,"it was good for the most part, couldn’t really...",0.7
7,"Well, I'm late to the bespoke party, but this ...",0.8


In [70]:
# Save the cleaned data
reviews_df.to_csv('../cleaned_reviews.csv', index=False)

In [79]:
#!pip install nltk
#!pip install contractions
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Define a set of important words that should not be removed
important_words = {"no", "not", "but"}
stop_words = stop_words - important_words

def clean_text(text):
    # Expand contractions
    text = contractions.fix(text)
    
    # Lowercase text
    text = text.lower()
    
    # Replace specific punctuation (e.g., ellipses) with a space
    text = re.sub(r'\.\.\.+', ' ', text)
    
    # Remove all punctuation except for spaces
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stop words, but keep important context words
    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    # Join tokens back into a sentence
    cleaned_text = ' '.join(cleaned_tokens)
    
    return cleaned_text

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [80]:
# Apply cleaning to the review_text column
reviews_df['cleaned_review'] = reviews_df['review_text'].apply(clean_text)

# Display the cleaned reviews
print(reviews_df[['review_text', 'cleaned_review']].head())

                                         review_text  \
0                                 monkey mondays #33   
1  I mean...it's no Pride and Prejudice (2005) bu...   
2  Addressed my inert fear of pink and pretty dre...   
5  it was good for the most part, couldn’t really...   
7  Well, I'm late to the bespoke party, but this ...   

                                      cleaned_review  
0                                   monkey monday 33  
1                   mean no pride prejudice 2005 but  
2  addressed inert fear pink pretty dress changed...  
5  good part could not really tell though near en...  
7  well late bespoke party but actually pretty go...  


In [83]:
reviews_df.iloc[3].review_text

'it was good for the most part, couldn’t really tell you though because near the end i just started putting airhead bites on my friend in the theater instead of paying attention'

In [84]:
reviews_df.iloc[3].cleaned_review

'good part could not really tell though near end started putting airhead bite friend theater instead paying attention'

In [85]:
reviews_df.iloc[3].rating

np.float64(0.7000000000000001)

In [90]:
# Define the thresholds for categories
def categorize_rating(rating):
    if rating <= 0.4:
        return 'bad'
    elif rating <= 0.7:
        return 'neutral'
    else:
        return 'good'

# Apply categorization to the rating column
reviews_df['rating_category'] = reviews_df['rating'].apply(categorize_rating)

# Display the updated dataframe
print(reviews_df[['rating', 'rating_category']].head())

   rating rating_category
0     0.8            good
1     0.6         neutral
2     0.6         neutral
5     0.7            good
7     0.8            good


In [91]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Convert categories to numerical values
reviews_df['rating_category_encoded'] = label_encoder.fit_transform(reviews_df['rating_category'])

# Display the updated dataframe with encoded categories
print(reviews_df[['rating_category', 'rating_category_encoded']].head())

  rating_category  rating_category_encoded
0            good                        1
1         neutral                        2
2         neutral                        2
5            good                        1
7            good                        1


In [112]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(reviews_df['cleaned_review'])

# Encode target variable
y = reviews_df['rating_category_encoded']

# Split data
X_train, X_test, y_train, y_test, X_train_raw, X_test_raw = train_test_split(
    X, y, reviews_df[['review_text']], test_size=0.5, random_state=421
)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8461538461538461
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.85      1.00      0.92        22
           2       0.00      0.00      0.00         2

    accuracy                           0.85        26
   macro avg       0.28      0.33      0.31        26
weighted avg       0.72      0.85      0.78        26



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [113]:
# Create DataFrame with test data and predictions
test_df = X_test_raw.copy()
test_df['true_rating'] = y_test
test_df['predicted_rating'] = y_pred

# Display the DataFrame
print(test_df.head())

                                          review_text  true_rating  \
45                                 wdym they broke up            1   
21                                       eat the rich            1   
63  The first 1 and a half hour may seem boring, b...            1   
14  yk the movies good when it has sons! \nthe men...            1   
53                        the only movie that matters            1   

    predicted_rating  
45                 1  
21                 1  
63                 1  
14                 1  
53                 1  


In [114]:
test_df

Unnamed: 0,review_text,true_rating,predicted_rating
45,wdym they broke up,1,1
21,eat the rich,1,1
63,"The first 1 and a half hour may seem boring, b...",1,1
14,yk the movies good when it has sons! \nthe men...,1,1
53,the only movie that matters,1,1
66,History,1,1
71,no words (actually a shit ton im just lazy),1,1
69,If the Oscar’s had a character for over hyped ...,0,1
47,I think you can now choose this film as the mo...,2,1
49,guess i'll see you in the movies,1,1


In [105]:
reviews_df.loc[45]

review_text                wdym they broke up
rating                                    0.8
cleaned_review                     wdym broke
rating_category                          good
rating_category_encoded                     1
Name: 45, dtype: object

In [111]:
reviews_df.query('rating_category_encoded==2')

Unnamed: 0,review_text,rating,cleaned_review,rating_category,rating_category_encoded
1,I mean...it's no Pride and Prejudice (2005) bu...,0.6,mean no pride prejudice 2005 but,neutral,2
2,Addressed my inert fear of pink and pretty dre...,0.6,addressed inert fear pink pretty dress changed...,neutral,2
37,fell asleep but idk if the movie was boring or...,0.5,fell asleep but not know movie boring tired,neutral,2
47,I think you can now choose this film as the mo...,0.6,think choose film overrated film today eu acho...,neutral,2
87,Was kind of a snooze fest ngl,0.6,kind snooze fest ngl,neutral,2
