In [1]:
import pandas as pd
import json

In [2]:
# Load the JSON file
with open('movies.json', 'r') as file:
    data = json.load(file)

# Convert to DataFrame
df = pd.DataFrame(data)

In [3]:
# Display the first few rows of the DataFrame
print(df.head())

                    name  year  rating                      genre
0  2001: A Space Odyssey  1968  7.5/10          [Science Fiction]
1     A Clockwork Orange  1971  8.4/10     [Crime, Drama, Sci-Fi]
2             Adam's Rib  1949  7.6/10   [Comedy, Drama, Romance]
3                Airport  1970  6.6/10  [Action, Drama, Thriller]
4                  Alien  1979    6/10          [Science Fiction]


In [4]:
# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values (if any)
df.dropna(inplace=True)

name      0
year      0
rating    2
genre     0
dtype: int64


In [5]:
# Convert ratings to numeric and create target column
df['rating'] = df['rating'].str.replace('/10', '').astype(float)
df['liked'] = (df['rating'] >= 7).astype(int)

In [6]:
# Select features and target
features = df[['year', 'genre']]
target = df['liked']

In [7]:
# One-hot encode the genre
features = features.join(df['genre'].str.get_dummies(sep=', '))

In [8]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
features[['year']] = scaler.fit_transform(features[['year']])

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [11]:
# One-hot encode the genre
genre_dummies = df['genre'].apply(lambda x: pd.Series(1, index=x)).fillna(0).astype(int)

# Combine the one-hot encoded genres with the original DataFrame
df = pd.concat([df, genre_dummies], axis=1)

# Drop the original genre column
df.drop(columns=['genre'], inplace=True)

In [12]:
# Select features and target
features = df.drop(columns=['liked', 'rating'])  # Exclude target and rating
target = df['liked']

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [15]:
# Check the data types of the features
print(X_train.dtypes)

name               object
year                int64
Science Fiction     int32
Crime               int32
Drama               int32
Sci-Fi              int32
Comedy              int32
Romance             int32
Action              int32
Thriller            int32
War                 int32
Musical             int32
Adventure           int32
History             int32
Biography           int32
Horror              int32
Mystery             int32
Film-Noir           int32
Fantasy             int32
Animation           int32
Family              int32
Western             int32
Music               int32
Sport               int32
Documentary         int32
dtype: object


In [16]:
# Drop non-numeric columns, e.g., 'name' if it exists
X_train = X_train.drop(columns=['name'], errors='ignore')
X_test = X_test.drop(columns=['name'], errors='ignore')

In [22]:
# One-hot encode the genre if not done yet
if 'genre' in df.columns:
    genre_dummies = df['genre'].apply(lambda x: pd.Series(1, index=x)).fillna(0).astype(int)
    df = pd.concat([df, genre_dummies], axis=1)
    df.drop(columns=['genre'], inplace=True)

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [27]:
print(X_train.columns)

Index(['name', 'year', 'Science Fiction', 'Crime', 'Drama', 'Sci-Fi', 'Comedy',
       'Romance', 'Action', 'Thriller', 'War', 'Musical', 'Adventure',
       'History', 'Biography', 'Horror', 'Mystery', 'Film-Noir', 'Fantasy',
       'Animation', 'Family', 'Western', 'Music', 'Sport', 'Documentary'],
      dtype='object')


In [29]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Identify different types of columns
object_columns = ['name']
year_column = ['year']
genre_columns = X_train.columns.drop(['name', 'year']).tolist()

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('name_encoder', OneHotEncoder(handle_unknown='ignore'), object_columns),
        ('year_scaler', StandardScaler(), year_column),
        ('genre_passthrough', 'passthrough', genre_columns)
    ])

# Create a pipeline with preprocessor and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

In [37]:
# One-hot encode the genre
genre_dummies = df['genre'].apply(lambda x: pd.Series(1, index=x)).fillna(0).astype(int)

# Combine the one-hot encoded genres with the original DataFrame
df = pd.concat([df, genre_dummies], axis=1)

# Drop the original genre column
df.drop(columns=['genre'], inplace=True)

In [39]:
print(df['rating'].dtype)

float64


In [40]:
# Remove '/10' if present, convert to float, and handle any errors
df['rating'] = pd.to_numeric(df['rating'].astype(str).str.replace('/10', ''), errors='coerce')

In [41]:
# Drop rows with NaN values in the rating column
df.dropna(subset=['rating'], inplace=True)

In [42]:
# Create target variable
df['liked'] = (df['rating'] >= 7).astype(int)

In [43]:
# Prepare features and target
X = df.drop(columns=['liked', 'rating'])
y = df['liked']

In [44]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
# Train the model
print(X_train.dtypes)
print(X_train.head())

name               object
year                int64
Science Fiction     int32
Crime               int32
Drama               int32
Sci-Fi              int32
Comedy              int32
Romance             int32
Action              int32
Thriller            int32
War                 int32
Musical             int32
Adventure           int32
History             int32
Biography           int32
Horror              int32
Mystery             int32
Film-Noir           int32
Fantasy             int32
Animation           int32
Family              int32
Western             int32
Music               int32
Sport               int32
Documentary         int32
dtype: object
                                    name  year  Science Fiction  Crime  Drama  \
54                          Frankenstein  1931                0      0      1   
299  The Curious Case of Benjamin Button  2008                0      0      1   
503              Star Trek Into Darkness  2013                0      0      0   
812         

In [47]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Define column types
name_column = ['name']
year_column = ['year']
genre_columns = X_train.columns.drop(['name', 'year']).tolist()

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('name_encoder', OneHotEncoder(handle_unknown='ignore'), name_column),
        ('year_scaler', StandardScaler(), year_column),
        ('genre_passthrough', 'passthrough', genre_columns)
    ])

# Create a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

In [48]:
y_pred = pipeline.predict(X_test)

In [49]:
trained_model = pipeline.named_steps['classifier']
coefficients = trained_model.coef_

In [51]:
# Extract the trained LogisticRegression model from the pipeline
# Get the coefficients
coefficients = trained_model.coef_

# Get the intercept
intercept = trained_model.intercept_

# Get the classes
classes = trained_model.classes_

In [52]:
# Extract the preprocessor
preprocessor = pipeline.named_steps['preprocessor']

# Preprocess the data
X_test_preprocessed = preprocessor.transform(X_test)

# Make predictions using the extracted model
predictions = trained_model.predict(X_test_preprocessed)

# Get prediction probabilities
prediction_probs = trained_model.predict_proba(X_test_preprocessed)

In [53]:
from sklearn.metrics import accuracy_score, classification_report

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

# Generate a classification report
report = classification_report(y_test, predictions)
print("Classification Report:")
print(report)

Accuracy: 0.7379679144385026
Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.04      0.08        45
           1       0.76      0.96      0.85       142

    accuracy                           0.74       187
   macro avg       0.50      0.50      0.46       187
weighted avg       0.64      0.74      0.66       187



In [69]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the dataset
df = pd.read_json('movies.json')

# Ensure the 'liked' column is present or create a placeholder
if 'liked' not in df.columns:
    df['liked'] = np.random.choice([0, 1], size=len(df))

# Handle and clean the 'genre' column
df['genre'] = df['genre'].apply(lambda x: x.split(', ') if isinstance(x, str) else [])
df['genre'] = df['genre'].apply(lambda x: ', '.join(x) if x else 'unknown')  # Replace empty lists with 'unknown'

# Ensure 'name' column is non-empty
df = df[(df['name'].str.strip() != '') & (df['genre'].str.strip() != '')]

# Prepare features and target variable
X = df[['name', 'year', 'rating', 'genre']]
y = df['liked']

# Check and convert data types
print(X.dtypes)

# Define transformers
text_transformer = TfidfVectorizer()
genre_transformer = TfidfVectorizer()  # TF-IDF for genre

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'name'),  # Process the 'name' column
        ('genre', genre_transformer, 'genre'),  # Process the 'genre' column
        ('num', 'passthrough', ['year', 'rating'])  # Pass through numeric columns
    ],
    remainder='drop'  # Drop other columns
)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Define hyperparameters for Grid Search
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

# Grid search
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Save the best model
joblib.dump(best_model, 'best_model_pipeline.pkl')

# Load the best model
loaded_model = joblib.load('best_model_pipeline.pkl')

# Predict on the test set
predictions = loaded_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.4f}")

# Generate a classification report
report = classification_report(y_test, predictions)
print("Classification Report:")
print(report)


name      object
year       int64
rating    object
genre     object
dtype: object
Fitting 5 folds for each of 36 candidates, totalling 180 fits


ValueError: 
All the 180 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1111, in _hstack
    check_array(X, accept_sparse=True, force_all_finite=False)
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_array_api.py", line 751, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: '7.5/10'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_set_output.py", line 313, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1006, in fit_transform
    return self._hstack(list(Xs), n_samples=n_samples)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\azizb\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1115, in _hstack
    raise ValueError(
ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.
