### Dataset Explanation

#### The dataset consists of the top 1000 rated movies
#### Columns Description:
##### 1) Title: describes the title of the movie
##### 2) Certificate: describes the audience the movie is intended for
##### 3) Duration: describes the duration of each movie in minutes
##### 4) Genre: describes the movie type (action, drama, etc.)
##### 5) Rate: describes the rating of the movie out of 10
##### 6) Metascore: describes the metascore of each movie
##### 7) Description: gives brief info about the movie
##### 8) Cast: describes the movie director and the lead actors
##### 9) Info: gives more info such as the number of people who voted and the movie gross

### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import warnings
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from wordcloud import WordCloud
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
from mlxtend.frequent_patterns import apriori, association_rules

# to stop showing warnings
# NOTE: no category or message filter is given, so every warning raised after this
# point is hidden, including pandas and library deprecation warnings
warnings.filterwarnings('ignore') 

### Load  Dataset

In [None]:
# the path is relative, so the CSV file is looked up in the current working directory
movies = pd.read_csv("IMDB top 1000.csv") # read dataset
movies.info() # show brief info about dataset (columns, non-null counts, dtypes)

### Clean and Preprocess Data

#### Clean Data

In [None]:
# remove the columns that are treated as unnecessary for the analysis
unused_columns = ['Unnamed: 0', 'Metascore', 'Cast', 'Info']
movies = movies.drop(columns=unused_columns)

# fill missing Certificate values with the constant 'Not Rated';
# SimpleImputer works on 2-D input, hence the reshape into a single column
Imputer = SimpleImputer(fill_value='Not Rated', strategy='constant')
certificate_values = movies['Certificate'].values.reshape(-1, 1)
movies['Certificate'] = Imputer.fit_transform(certificate_values)

# show brief info about the dataset after cleaning
movies.info()

#### Preprocess Data

In [None]:
# build the supervised learning dataframe: the Description text followed by
# one 0/1 indicator column per genre (Drama, Action, ...etc.)

# split the ', ' separated Genre strings into indicator columns
genre_indicators = movies['Genre'].str.get_dummies(sep=', ')

# keep only Description from the original columns and attach the indicators
df_supervised = movies[['Description']].join(genre_indicators)

In [None]:
# preprocess unsupervised learning data frame

# select Certificate, Duration, Rate, and Genre columns; take an explicit copy so the
# column assignments below modify this dataframe and not a slice of `movies`
df_unsupervised = movies[['Certificate', 'Duration', 'Rate', 'Genre']].copy()

# encode Certificate labels as integers with the help of label encoder
df_unsupervised['Certificate'] = LabelEncoder().fit_transform(df_unsupervised['Certificate'])

# keep only the digits of Duration, then change its type to integer
df_unsupervised['Duration'] = (
    df_unsupervised['Duration']
    .str.replace(r'[^0-9]', '', regex=True)
    .astype('int64')
)

# create one 0/1 column for each genre in the Genre column (Drama, Action, ...etc.)
genre_indicators = df_unsupervised['Genre'].str.get_dummies(', ')

# replace the Genre column with its indicator columns
df_unsupervised = df_unsupervised.drop(columns='Genre').join(genre_indicators)

### Data Exploration, Investigation and Visualization

In [None]:
df_supervised.info() # print columns, non-null counts and dtypes of the supervised learning dataframe

In [None]:
df_unsupervised.info() # print columns, non-null counts and dtypes of the unsupervised learning dataframe

In [None]:
# ratings of the 10 highest rated movies (the index keeps the original row labels)
top_ranks = movies['Rate'].sort_values(ascending=False)[:10]

# names of those movies: first column of `movies`, selected by row label so the
# lookup does not depend on row labels and row positions happening to coincide
top_movies = movies.iloc[:, 0].loc[top_ranks.index].reset_index(drop=True)

# clean the names: keep only ASCII letters and whitespace.
# The class is spelled A-Za-z on purpose: the range A-z would also let the
# characters [ \ ] ^ _ ` through.
# NOTE(review): digits that are part of a title are removed as well - confirm this is intended
top_movies = [re.sub(r"[^A-Za-z\s]", "", title).strip() for title in top_movies]

# show top 10 rated movies
print("Top 10 Movies that have highest rating:")
for rank, movie in enumerate(top_movies, start=1):
    print(rank, movie)

In [None]:
# genre names: every column after the first one (Description)
genres = df_supervised.columns[1:]

# movies per genre: each genre column holds 0/1, so its sum is the number of 1s
genres_counts = list(df_supervised[genres].sum())

# bar plot Genres vs Count
fig = plt.figure(figsize=(30, 8))
bar_positions = range(len(genres))
plt.bar(bar_positions, genres_counts, tick_label=genres)
plt.ylabel('Count', fontsize=16)
plt.title('Genres vs Count', fontsize=18)
plt.show()

In [None]:
# text for the cloud: the top 10 rated movie names joined by single spaces
top_titles_text = ' '.join(top_movies)

# word cloud settings; random_state fixes the layout between runs
cloud_options = {
    'width': 1000,
    'height': 800,
    'random_state': 21,
    'background_color': 'white',
    'min_font_size': 10,
}
wordcloud = WordCloud(**cloud_options).generate(top_titles_text)

# draw the word cloud image without axes
fig = plt.figure(figsize=(25, 6))
plt.imshow(wordcloud)
plt.axis('off')
plt.title('Top 10 Rated Movies', fontsize=16)
plt.show()

### PCA (Principal Component Analysis)

In [None]:
# standardise every column of the unsupervised dataframe before PCA
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_unsupervised)

# PCA model that keeps 3 principal components
pca = PCA(n_components=3)

# fit the model on the scaled data and project the data onto the components
pca_data = pca.fit_transform(scaled_data)

In [None]:
# share of the variance explained by each component, in percent
variance_percentage = pca.explained_variance_ratio_ * 100

# principal component names: PC1, PC2, ...
PC_names = ['PC' + str(i) for i in range(1, len(variance_percentage) + 1)]

# show bar plot (variance percentage for each principal component)
fig = plt.figure(figsize=(7, 5))
plt.bar(range(len(variance_percentage)), variance_percentage, tick_label=PC_names)
plt.xlabel('Principal Components', fontsize=16)  # label previously read "Principle"
plt.ylabel('Variance Percentage', fontsize=16)
plt.show()

#### PCA 3D Visualization

In [None]:
# visualize data after PCA as 3D scatter plot
# Figure.gca() no longer accepts a projection (its keyword arguments were deprecated
# in Matplotlib 3.4 and removed in 3.6), so the 3D axes are created with add_subplot
ax = plt.figure(figsize=(10, 10)).add_subplot(projection='3d')
ax.scatter(pca_data[:, 0], pca_data[:, 1], pca_data[:, 2])
ax.set(xlabel=' PC1', ylabel=' PC2', zlabel=' PC3')
plt.show()

### Supervised Learning

In [None]:
# split the supervised learning dataframe: description text vs. genre indicator columns
descriptions = df_supervised['Description']
genres = df_supervised.drop(columns='Description')

In [None]:
# cache for the stop-word set so the NLTK corpus is read only once per session
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text):
    """Return a cleaned version of a piece of text.

    The text is split into sentences; each sentence is lower-cased, every
    non-word character is replaced by a space, digits are removed and runs of
    whitespace are collapsed. The sentences are then tokenized into words, and
    English stop words and words shorter than two characters are dropped.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The kept words in their original order, each followed by a single
        space (so a non-empty result ends with a space).

    Notes
    -----
    Relies on the NLTK tokenizers and stop-word corpus being available
    (e.g. installed through nltk.download).
    """
    stop_words = _english_stop_words()
    kept_words = []
    for sentence in nltk.sent_tokenize(text):
        sentence = sentence.lower()  # make the sentence lower case
        sentence = re.sub(r'\W', ' ', sentence)  # remove punctuations
        sentence = re.sub('[0-9]', '', sentence)  # remove numbers
        sentence = re.sub(r'\s+', ' ', sentence)  # remove multiple white spaces
        for word in nltk.word_tokenize(sentence):
            # keep the word if it is not a stop word and not a single letter
            if word not in stop_words and len(word) >= 2:
                kept_words.append(word)
    # every kept word is followed by one space, matching the previous output format
    return ''.join(word + ' ' for word in kept_words)

In [None]:
# run every description through clean_text
cleaned_descriptions = descriptions.apply(clean_text)

# turn the cleaned descriptions into vectors of word counts (dense array)
count_vectorizer = CountVectorizer()
descriptions = count_vectorizer.fit_transform(cleaned_descriptions).toarray()

# for debugging purposes
descriptions.shape

In [None]:
# X (features) = the description vectors, y (targets or labels) = the genre values
X, y = descriptions, genres.to_numpy()

# for debugging purposes
X.shape, y.shape

In [None]:
# create Multi output classifier that uses a gaussian naive bayes estimator for each target column
clf = MultiOutputClassifier(GaussianNB(), n_jobs=-1)

# perform 5-fold cross validation on the features and targets, scored with weighted F1
scores = cross_val_score(clf, X, y, cv=5, scoring='f1_weighted', n_jobs=-1)

# mean score over the folds; kept under its own name so the f1_score function
# imported from sklearn.metrics is not overwritten
mean_f1_score = np.mean(scores)

# show cross validation mean score
print('Cross validation mean score is:', mean_f1_score)

### Outlier Removal

In [None]:
# durations before outliers removal, all drawn on the horizontal line y = 1
durations_before = df_unsupervised['Duration']
plt.scatter(durations_before, np.ones(durations_before.shape))
plt.xlabel('Duration')

In [None]:
# first and third quartiles of Duration
Q1, Q3 = (df_unsupervised['Duration'].quantile(q) for q in (0.25, 0.75))

# interquartile range
IQR = Q3 - Q1

In [None]:
# remove durations that are less than Q1 - 3.5*IQR and durations bigger than Q3 + 3.5*IQR
# The bounds are derived from Duration, so they are compared against the Duration
# column only; testing every column against them would drop rows based on
# unrelated columns (Certificate, Rate, genre indicators).
lower_bound = Q1 - 3.5 * IQR
upper_bound = Q3 + 3.5 * IQR

# between() is inclusive on both ends, i.e. values equal to a bound are kept
df_unsupervised = df_unsupervised[df_unsupervised['Duration'].between(lower_bound, upper_bound)]

# for debugging purposes
df_unsupervised.shape

In [None]:
# durations after outliers removal, all drawn on the horizontal line y = 1
durations_after = df_unsupervised['Duration']
plt.scatter(durations_after, np.ones(durations_after.shape))
plt.xlabel('Duration')

### Unsupervised Learning

#### Elbow Method to determine the best number of clusters

In [None]:
# define ssd_to_center list (sum of squared distances to closest cluster center)
ssd_to_center = []

# define k values range that we want to try
k_range = range(1, 15)

for k in tqdm(k_range):
    # kmeans instance for the current k; the fixed random_state makes the
    # centroid initialisation, and therefore the elbow curve, reproducible
    kmeans = KMeans(n_clusters=k, random_state=21)
    kmeans.fit(df_unsupervised)  # clustering
    ssd_to_center.append(kmeans.inertia_)  # record the sum of squared distances for this k

In [None]:
# elbow method graph: a line through the values plus a red marker at every tried k
fig = plt.figure(figsize=(8, 5))
tried_ks = list(k_range)
plt.plot(tried_ks, ssd_to_center)
plt.scatter(tried_ks, ssd_to_center, marker='o', c='r')
plt.xlabel('k', fontsize=16)
plt.ylabel('Sum of Squared Distances', fontsize=16)
plt.title('Elbow Method', fontsize=20)

In [None]:
# get an instance of kmeans with k = optimal observed k = 4; the fixed random_state
# keeps the cluster labels and centers the same from run to run
kmeans = KMeans(n_clusters=4, random_state=21)

kmeans.fit(df_unsupervised)  # clustering

labels = kmeans.labels_  # cluster label assigned to each row

centers = kmeans.cluster_centers_  # cluster centers, one row per cluster

In [None]:
# positions of the plotted features among the columns the model was fitted on
duration_col = df_unsupervised.columns.get_loc('Duration')
rate_col = df_unsupervised.columns.get_loc('Rate')

# movies coloured by cluster label, with the cluster centers drawn as red stars
plt.scatter(df_unsupervised['Duration'], df_unsupervised['Rate'], c=labels)
plt.scatter(centers[:, duration_col], centers[:, rate_col], marker='*', c='r', s=300)
plt.title('Clusters Visualization')
plt.xlabel('Duration')
plt.ylabel('Rate')
plt.show()

### Association Rule Mining

In [None]:
# Getting frequent itemsets
# apriori works on a one-hot encoded boolean DataFrame; the genre indicators are
# stored as 0/1 integers, which mlxtend flags as deprecated input, so cast them first
freq_itemsets = apriori(genres.astype(bool), min_support=0.002, use_colnames=True)

# Getting Association Rules (only rules with a lift of at least 4)
rules_list = association_rules(freq_itemsets, metric="lift", min_threshold=4)

# Sorting Association Rules: highest confidence first, ties ordered by highest lift
rules_list = rules_list.sort_values(['confidence', 'lift'], ascending=False).reset_index(drop=True)

In [None]:
# keep only the rules whose confidence is strictly higher than 0.9
is_high_confidence = rules_list['confidence'] > 0.9
filtered_rules = rules_list[is_high_confidence]

# show filtered rules
filtered_rules