## RECOMMENDATION SYSTEM: MACHINE LEARNING DEVELOPMENT

#### Import needed libraries

In [216]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
import unicodedata
from ast import literal_eval

#### Load dataset after EDA analysis

In [259]:
# Read the dataset produced by the EDA stage into the working frame.
df = pd.read_csv(filepath_or_buffer="Dataset_ML/Dataset_ML_function.csv")

#### Setting features for ML training data

In [260]:
# Lower-case every title.
df["title"] = df["title"].str.lower()
# Remove every space from the serialized genres text (spaces inside entries go too,
# so multi-word entries collapse into one token).
df["genres"] = df["genres"].str.replace(" ", "")
# Parse the serialized text into the Python literal it represents.
df["genres"] = df["genres"].apply(literal_eval)
# Seed the features column from the parsed genres.
df["features"] = df["genres"]

#### Getting an alphabet soup

In [None]:
# Build the text "soup" with whole-column operations instead of assigning row by row
# through `df.genres.iloc[i] = ...`. That chained assignment triggers
# SettingWithCopyWarning and leaves `df` unchanged when pandas Copy-on-Write is active.

# Collapse each genres list into one space-separated string.
df["genres"] = df["genres"].map(" ".join)

# Append the overview to the genres string. A missing overview is treated as empty
# text so the concatenation yields a string for every row.
df["features"] = df["genres"] + " " + df["overview"].fillna("")

In [262]:
# Order rows from most to least popular (a later cell keeps only the first rows to
# limit computational cost), drop the columns the model does not use, lower-case the
# features text, and renumber the rows from zero.
df = (
    df.sort_values("popularity", ascending=False)
    .drop(columns=["genres", "overview", "popularity"])
    .assign(features=lambda frame: frame["features"].str.lower())
    .reset_index(drop=True)
)

#### Dataset view

In [231]:
# Preview the prepared frame; the bare expression is rendered as the cell output.
df

Unnamed: 0,title,features
0,minions,despicable me collection family animation adve...
1,baby driver,action crime after being coerced into working...
2,big hero 6,adventure family animation action comedy the ...
3,deadpool,deadpool collection action adventure comedy de...
4,guardians of the galaxy vol. 2,guardians of the galaxy collection action adve...
...,...,...
39304,terror,"horror dr. vishal, a mad scientist, turns int..."
39305,altar of fire,documentary this film records a 12 day ritual...
39306,casual relations,drama in rappaportâ€™s dazzling and bizarre f...
39307,narrien illat,comedy drama music the ups and downs of the p...


#### Export dataset for ML training data

In [263]:
# Persist the prepared frame without the index column.
df.to_csv(path_or_buf="Dataset_API/API7.csv", index=False)

### ML model development

#### Machine learning is often used in text analysis, and TF-IDF helps with categorizing data as well as extracting keywords. TF-IDF gives a solid grasp of essential words (Shubham Shankar).

In [264]:
# Reload the exported dataset.
df = pd.read_csv(filepath_or_buffer="Dataset_API/API7.csv")

# Keep only the first 10,000 rows (the full dataset is too expensive to compare
# pairwise). The first reset renumbers the rows from zero; the second turns that
# numbering into an explicit "index" column.
ml = df.head(10_000).reset_index(drop=True).reset_index()


In [266]:
# Lookup table mapping each title to its row position in `ml`.
indices = ml[["title", "index"]]

# TF-IDF vectorizer: English stop words removed, vocabulary capped at 10,000 terms.
tfidf = TfidfVectorizer(stop_words="english", max_features=10000)

# Learn the vocabulary from the features text and encode every row as a TF-IDF vector.
tfidf_matrix = tfidf.fit_transform(ml["features"])

# Pairwise dot products between all rows. TfidfVectorizer L2-normalizes rows by
# default, so these dot products are cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
 

#### API function ML model

In [271]:
def recomendacion(titulo):  # API function ML
    """Recommend up to five titles whose features are most similar to ``titulo``.

    The query is lower-cased, stripped of surrounding whitespace and reduced to
    ASCII (accent marks are dropped) before being matched exactly against the
    ``title`` column of the module-level ``indices`` frame. When several rows share
    the title, the first one is used.

    Parameters
    ----------
    titulo : str
        Movie title to look up.

    Returns
    -------
    dict
        ``{'titulo': <normalized query>, 'lista recomendada': <list of titles>}``.
        The list is ``["No data available"]`` when no row matches the query.

    Notes
    -----
    Reads the module-level ``indices``, ``cosine_sim`` and ``ml`` objects.
    NOTE(review): accents are stripped from the query only; confirm the stored
    titles are accent-free as well, otherwise accented titles can never match.
    """
    titulo = titulo.lower().strip()  # parameter string to lower
    # Decompose accented characters, then drop everything that is not ASCII.
    titulo = unicodedata.normalize('NFKD', titulo).encode(
        'ascii', 'ignore').decode('utf-8', 'ignore')
    idx = indices[indices["title"] == titulo]  # rows whose title equals the query
    if idx.empty:
        recommendations = ["No data available"]
    else:
        idy = idx["index"].iloc[0]  # row position of the first match
        # Drop the queried row by position instead of assuming it sorts first:
        # a row with identical features ties at the top score and may precede it,
        # which previously let the queried title appear in its own recommendations.
        sim_score = [
            (position, score)
            for position, score in enumerate(cosine_sim[idy])
            if position != idy
        ]
        # Highest similarity first; the sort is stable, so ties keep row order.
        sim_score = sorted(sim_score, key=lambda x: x[1], reverse=True)
        sim_score = sim_score[:5]  # top five candidates
        movies_index = [i[0] for i in sim_score]
        recommendations = list(ml['title'].iloc[movies_index].str.title())
    return {'titulo': titulo, 'lista recomendada': recommendations}


In [272]:
recomendacion("toy story")  # testing functionality with an argument


{'titulo': 'toy story',
 'lista recomendada': ['Toy Story 3',
  'Toy Story 2',
  'The 40 Year Old Virgin',
  'Small Fry',
  'Man On The Moon']}