In [1]:
import pandas as pd
import numpy as np
import random
import string
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score

### PRE-PROCESSING FUNCTIONS

In [2]:
def random_string(length=6):
    """Return a random string of lowercase ASCII letters.

    Args:
        length: Number of characters to generate (default 6).

    Returns:
        A ``str`` of ``length`` characters, each picked with
        ``random.choice`` from ``string.ascii_lowercase``.
    """
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

In [3]:
def replace_null_or_n(value, column):
    r"""Replace a missing cell with a random placeholder suited to its column.

    A cell counts as missing when ``pd.isnull(value)`` is true or when it is
    the literal two-character string ``\N``. Missing cells cannot be used for
    model training/testing, so they are filled in as follows:

    * ``genres``, ``titleType``, ``primaryTitle``, ``originalTitle``:
      a fresh ``random_string()``
    * ``isAdult``: a random ``0`` or ``1``
    * ``runtimeMinutes``: one of ``100``, ``150`` or ``200``

    Args:
        value: The cell value to inspect.
        column: Name of the column the cell belongs to.

    Returns:
        The placeholder for a missing cell in one of the columns above;
        otherwise ``value`` itself, unchanged.
    """
    is_missing = pd.isnull(value) or value == r"\N"
    if not is_missing:
        return value

    if column in ('genres', 'titleType', 'primaryTitle', 'originalTitle'):
        return random_string()
    if column == 'isAdult':
        return random.randint(0, 1)
    if column == 'runtimeMinutes':
        return random.choice([100, 150, 200])
    # Missing value in a column without a placeholder rule: leave it as it is.
    return value

In [52]:
def prepare_dataset_for_model(data):
    """Clean, filter and numerically encode a titles DataFrame for modelling.

    Steps, in order:
      1. Drop ``originalTitle``, ``startYear``, ``endYear`` and ``tconst``.
      2. Reduce ``genres`` to the first entry of its comma-separated list.
      3. Label-encode ``titleType``, ``primaryTitle`` and ``genres``.
      4. Keep only rows whose ``isAdult`` is 0/1 (as number or string) and
         convert that column to ``int``.
      5. Coerce every column to numeric and drop rows that contain NaN.

    A short summary (distinct ``isAdult`` values, shape, first five rows) is
    printed before returning.

    Args:
        data: DataFrame containing at least the columns named above plus
            ``isAdult``.

    Returns:
        A new DataFrame with the prepared columns; ``data`` itself is left
        untouched because the first step works on the result of ``drop``.
    """
    # Columns that are not used as model inputs.
    unused_columns = ['originalTitle', 'startYear', 'endYear', 'tconst']
    frame = data.drop(columns=unused_columns)

    # Only the first genre of the comma-separated list is kept.
    frame['genres'] = frame['genres'].astype(str).apply(
        lambda genre_list: genre_list.split(',')[0]
    )

    # Integer-encode the string columns; the encoder is refit for each column.
    encoder = LabelEncoder()
    for name in ('titleType', 'primaryTitle', 'genres'):
        frame[name] = encoder.fit_transform(frame[name])

    # Discard rows whose 'isAdult' is anything other than 0/1, then turn the
    # surviving values (including the strings '0'/'1') into integers.
    frame = frame[frame['isAdult'].isin([0, 1, '0', '1'])]
    frame['isAdult'] = frame['isAdult'].apply(lambda flag: int(flag))

    # Anything still non-numeric becomes NaN, and rows containing NaN are dropped.
    frame = frame.apply(pd.to_numeric, errors='coerce').dropna()

    adult_values = frame['isAdult'].unique()
    print(f"Unique value in isAdult: {adult_values}")
    print(f"Number of rows and columns: {frame.shape}")
    print("First five rows:")
    print(frame.head(5))
    return frame


### EVALUATION FUNCTIONS

In [37]:
def model_training_for_profiling_evaluation(data, target, features):
    """Fit a random forest on ``data`` and print its accuracy and F1 score.

    The rows are split 80/20 into train and test parts, a
    ``RandomForestClassifier`` wrapped in a one-step ``Pipeline`` is fitted on
    the training part, and accuracy plus weighted F1 on the held-out part are
    printed. Shapes and progress messages are printed along the way.

    Args:
        data: DataFrame holding both the feature columns and the target column.
        target: Name of the column to predict.
        features: List of column names used as model inputs.

    Returns:
        None. The metrics are only printed.
    """
    inputs, labels = data[features], data[target]
    print(f"Feature data shape: {inputs.shape}")
    print(f"Target data shape: {labels.shape}")

    # random_state is fixed for both the split and the forest.
    inputs_train, inputs_test, labels_train, labels_test = train_test_split(
        inputs, labels, test_size=0.2, random_state=42
    )
    print(f"Training set size: {inputs_train.shape}")
    print(f"Test set size: {inputs_test.shape}")

    print("Training of the model...")
    # Model used: RandomForestClassifier
    forest = RandomForestClassifier(random_state=42)
    model = Pipeline(steps=[('classifier', forest)])
    model.fit(inputs_train, labels_train)
    print("The model was trained.")

    predictions = model.predict(inputs_test)
    test_accuracy = accuracy_score(labels_test, predictions)
    weighted_f1 = f1_score(labels_test, predictions, average='weighted')

    print(f'Accuracy: {test_accuracy}')
    print(f'F1 Score: {weighted_f1}')
    

### DIRTY TITLES DATASET

In [39]:
# Load the dirty titles CSV (malformed lines are skipped) and keep only its
# last 1,000,000 rows. Note: this is the tail of the file, not a random sample.
dirty_data = pd.read_csv(
    '../../datasets/initial_datasets/dirty_titles.csv',
    on_bad_lines='skip',
).tail(1000000)

  dirty_data = pd.read_csv('../../datasets/initial_datasets/dirty_titles.csv', on_bad_lines='skip')


In [40]:
# Applies the function replace_null_or_n to every column of the DataFrame.
# The global `random` generator is seeded first so that the random
# placeholders (and therefore the metrics computed from them) are the same
# on every run of the notebook.
random.seed(42)
for column in dirty_data.columns:
    # `args` forwards the column name as the second positional argument.
    dirty_data[column] = dirty_data[column].apply(replace_null_or_n, args=(column,))

In [41]:
# Rebinds `dirty_data` to the prepared frame returned by
# prepare_dataset_for_model; the columns that function drops are no longer
# reachable under this name afterwards.
# NOTE(review): re-running this cell without reloading the CSV will presumably
# fail, because the columns to drop are already gone - confirm.
dirty_data = prepare_dataset_for_model(dirty_data)

Unique value in isAdult: [1 0]
Number of rows and columns: (1000000, 5)
First five rows:
          titleType  primaryTitle  isAdult  runtimeMinutes  genres
63027488     850866        895795        1             100  934085
63027489     738548        522463        1             200  706907
63027490     631715        786311        1             100  294350
63027491     444389         31395        1             100  561179
63027492      41950        336227        1             200  726894


In [42]:
# Predict `isAdult` from the four other columns of the prepared dirty frame.
target = 'isAdult'
features = ['titleType', 'primaryTitle', 'runtimeMinutes', 'genres']
model_training_for_profiling_evaluation(data=dirty_data, target=target, features=features)

Feature data shape: (1000000, 4)
Target data shape: (1000000,)
Training set size: (800000, 4)
Test set size: (200000, 4)
Training of the model...
The model was trained.
Accuracy: 0.50047
F1 Score: 0.5003204493184108


### CLEANED TITLES DATASET

In [43]:
# Load the cleaned titles CSV (malformed lines are skipped) and keep only its
# last 1,000,000 rows. Note: this is the tail of the file, not a random sample.
cleaning_data = pd.read_csv(
    '../../datasets/cleaned_datasets/cleaned_titles.csv',
    on_bad_lines='skip',
).tail(1000000)

In [44]:
# Applies the function replace_null_or_n to every column of the DataFrame.
# The loop iterates over cleaning_data's own columns, so this cell does not
# depend on which columns dirty_data currently has.
# The global `random` generator is seeded first so that the random
# placeholders are the same on every run of the notebook.
random.seed(42)
for column in cleaning_data.columns:
    # `args` forwards the column name as the second positional argument.
    cleaning_data[column] = cleaning_data[column].apply(replace_null_or_n, args=(column,))

In [53]:
# Rebinds `cleaning_data` to the prepared frame returned by
# prepare_dataset_for_model; the columns that function drops are no longer
# reachable under this name afterwards.
# NOTE(review): re-running this cell without reloading the CSV will presumably
# fail, because the columns to drop are already gone - confirm.
cleaning_data = prepare_dataset_for_model(cleaning_data)

Unique value in isAdult: [0 1]
Number of rows and columns: (999870, 5)
First five rows:
         titleType  primaryTitle  isAdult  runtimeMinutes  genres
9667599          2        321032        0             100       8
9667600          2        224726        0             100       8
9667601          2        160042        0             200       8
9667602          2        120128        0             150      24
9667603          0        489343        0             100      12


In [54]:
# Predict `isAdult` from the four other columns of the prepared cleaned frame.
target = 'isAdult'
features = ['titleType', 'primaryTitle', 'runtimeMinutes', 'genres']
model_training_for_profiling_evaluation(data=cleaning_data, target=target, features=features)

Feature data shape: (999870, 4)
Target data shape: (999870,)
Training set size: (799896, 4)
Test set size: (199974, 4)
Training of the model...
The model was trained.
Accuracy: 0.9993849200396051
F1 Score: 0.9993832974720626
