### IMPORTING LIBRARIES


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('ggplot')

from matplotlib.pyplot import figure

%matplotlib inline
# Notebook-wide plotting defaults: figure size and font family
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.family': "DejaVu Sans",
})


# Load the raw movies dataset

df = pd.read_csv('movies.csv')


## CLEANING THE DATAFRAME


In [3]:
# Share of missing entries per column, printed as a percentage.
# Roughly 28% of 'budget', 2% of 'gross' and 1% of 'rating' are missing.
df1 = df.isnull().mean()
print(df1.mul(100))

name         0.000000
rating       1.004173
genre        0.000000
year         0.000000
released     0.026082
score        0.039124
votes        0.039124
director     0.000000
writer       0.039124
star         0.013041
country      0.039124
budget      28.312467
gross        2.464789
company      0.221701
runtime      0.052165
dtype: float64


### IMPUTING MISSING VALUES WITH THE MICE ALGORITHM

In [17]:
## Fixing missing values with MICE algorithm (Multiple Imputation by Chained Equations)
# The experimental flag has to be imported first; it enables the IterativeImputer import below.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(max_iter=10, random_state=42)

# Fit on the two monetary columns and fill their gaps in a single pass.
# Each column is estimated from the other one only.
imputed_array = imputer.fit_transform(df[['budget', 'gross']])

# Convert the imputed data back to a DataFrame.
# Reuse df's own index: the assignment below aligns on index labels, so a fresh
# 0..n-1 index would silently misplace values whenever df is not default-indexed.
df_imputed = pd.DataFrame(imputed_array, columns=['budget', 'gross'], index=df.index)

# Merge the imputed values back into the original dataset
df[['budget', 'gross']] = df_imputed

In [18]:
# Checking that imputation left the originally observed 'budget' values untouched.
# df['budget'] has already been overwritten with the imputed column, so comparing
# df against df_imputed would compare the data with itself. The reference values
# are therefore re-read from the raw file.
# NOTE(review): assumes df_imputed still has one row per row of movies.csv, in the
# same order (i.e. no rows were dropped before imputation) -- confirm.
raw_budget = pd.read_csv('movies.csv', usecols=['budget'])['budget']
was_observed = raw_budget.notna().to_numpy()

original_values = raw_budget[was_observed]
imputed_values = df_imputed['budget'].to_numpy()[was_observed]

difference = original_values - imputed_values

# Calculate the mean absolute error (MAE) over the originally observed rows
mae = difference.abs().mean()
print(mae) # = 0
# An MAE of 0 here only confirms that values which were present in the raw data
# came out of the imputer unchanged. It does NOT measure how accurate the
# filled-in values are: the true budgets of those rows are unknown. Estimating
# imputation accuracy would need known values to be masked and then compared
# with what the imputer predicts for them.

0.0


### DATA TYPES AND CLEANING


In [6]:
# Report the remaining share of missing values per column (in percent),
# then discard every row that still has a gap (about 1% is missing in 'rating').
df1 = df.isnull().mean()
print(df1.mul(100))

df = df.dropna(axis=0, how='any')

name        0.000000
rating      1.004173
genre       0.000000
year        0.000000
released    0.026082
score       0.039124
votes       0.039124
director    0.000000
writer      0.039124
star        0.013041
country     0.039124
budget      0.000000
gross       0.000000
company     0.221701
runtime     0.052165
dtype: float64


In [7]:
# Store the two monetary columns as 64-bit integers
for money_col in ('budget', 'gross'):
    df[money_col] = df[money_col].astype('int64')

df.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000,46998772,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000,58853106,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000,538375067,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000,83453539,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000,39846344,Orion Pictures,98.0


In [8]:
# Year column fitting with Release Year data
#
# 'released' holds strings such as "June 13, 1980 (United States)". One anchored
# pattern extracts the month name, the day and the year. Values that do not start
# with a complete "Month D, YYYY" date (for instance entries without a day number)
# yield NaN for all three parts and are removed by the dropna() below, instead of
# leaving fragments such as 'States)' in 'Day' or a month name in 'Year'.
release_parts = df['released'].astype(str).str.extract(
    r'^\s*(?P<Month>[A-Za-z]+)\s+(?P<Day>\d{1,2}),\s*(?P<Year>\d{4})'
)

## Applying these columns to the original dataset
# 'Year' and 'Day' are kept as digit strings here; only 'Month' is converted below.
df['Month'] = release_parts['Month']
df['Year'] = release_parts['Year']
df['Day'] = release_parts['Day']

## Applying transformation into integer values
month_map = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

# Month names that are not in the map become NaN
df['Month'] = df['Month'].map(month_map)

## Fixing NA Month Values
# Removes rows with an unparseable release date (and any other row with a missing value)
df = df.dropna()

df['Month'] = df['Month'].astype('int64')


In [9]:
# Cleaning Day column
# Release dates without a day number leave a non-numeric token in 'Day'
# (for example 'States)', 'Kingdom)' or '(Australia)'). Drop every such row,
# whatever the token is, so that converting 'Day' to an integer cannot fail
# on a country that was not listed by hand.
day_is_numeric = df['Day'].astype(str).str.isdigit()
df = df.drop(df.index[~day_is_numeric])

In [10]:
# Store the day of the month as a 64-bit integer
df = df.assign(Day=df['Day'].astype('int64'))

In [11]:
# Removing the 'released' column and previewing the result
df = df.drop(columns=['released'])
df.head()

Unnamed: 0,name,rating,genre,year,score,votes,director,writer,star,country,budget,gross,company,runtime,Month,Year,Day
0,The Shining,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000,46998772,Warner Bros.,146.0,6,1980,13
1,The Blue Lagoon,R,Adventure,1980,5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000,58853106,Columbia Pictures,104.0,7,1980,2
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000,538375067,Lucasfilm,124.0,6,1980,20
3,Airplane!,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000,83453539,Paramount Pictures,88.0,7,1980,2
4,Caddyshack,R,Comedy,1980,7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000,39846344,Orion Pictures,98.0,7,1980,25


In [12]:
# Write the cleaned dataset to disk, without the index column
clean_csv_path = 'movies_clean.csv'
df.to_csv(clean_csv_path, index=False)

## TRANSFORMING THE DATAFRAME

### CLEANING AND NORMALIZATION OF THE DATA

In [13]:
# Reload the cleaned dataset and discard its 'year' column

df = pd.read_csv('movies_clean.csv').drop(columns=['year'])

# Independent working copies for the text and the numeric parts
df_object, df_numeric = df.copy(), df.copy()


In [14]:
# Split the frame by dtype: numeric columns on one side, object (text) columns on the other
df_numeric = df.select_dtypes(include=['number'])
df_object = df.select_dtypes(include=['object'])

#### Normalization of Object-Type Columns

In [15]:
# Encoder and scaler shared by the normalization cells
from sklearn.preprocessing import LabelEncoder, StandardScaler

le, scaler = LabelEncoder(), StandardScaler()

In [16]:
# Helper that re-encodes the selected columns of a frame with a given transformer
def transform_columns(df, columns, transformer):
    """Replace each listed column with the transformer's output for that column.

    Parameters:
        df: frame whose columns are overwritten; it is modified in place.
        columns: iterable of column labels to process, in order.
        transformer: object exposing ``fit_transform``; it is fitted again on
            every column, so after the call it only reflects the last one.

    Returns:
        The same ``df`` object that was passed in.
    """
    for col_name in columns:
        encoded = transformer.fit_transform(df[col_name])
        df[col_name] = encoded
    return df

# Encode every text column as integer labels
df_object = transform_columns(df_object, df_object.columns, le)

# Standardize the encoded text columns, keeping their column names
object_scaled = scaler.fit_transform(df_object)
df_object = pd.DataFrame(object_scaled, columns=df_object.columns)

# Standardize the numeric columns; the same scaler object is fitted again here
numeric_scaled = scaler.fit_transform(df_numeric)
df_numeric = pd.DataFrame(numeric_scaled, columns=df_numeric.columns)

# Put both parts side by side and write the result to disk
df_normalized = pd.concat([df_object, df_numeric], axis='columns')

df_normalized.to_csv('df_normalized.csv', index=False)