In [103]:
import numpy as np
import pandas as pd

# Load the shows table from a CSV file in the notebook's working directory and
# preview the first rows to see which columns are available.
df = pd.read_csv('Shows.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,ID,Name,Year,Rating,Duration,Reviews,Director,Genre,Link
0,0,tt5491994,Planet Earth II,2016,9.5,4h 58m,139,David Attenborough|Chadden Hunter|Elizabeth White,Documentary,https://www.imdb.com/title/tt5491994/
1,1,tt0795176,Planet Earth,2006,9.4,8h 58m,100,David Attenborough|Sigourney Weaver|Nikolay Dr...,Documentary,https://www.imdb.com/title/tt0795176/
2,2,tt0903747,Breaking Bad,2008,9.4,49m,4K,Bryan Cranston|Aaron Paul|Anna Gunn,Crime|Drama|Thriller,https://www.imdb.com/title/tt0903747/
3,3,tt0185906,Band of Brothers,2001,9.4,9h 54m,929,Scott Grimes|Damian Lewis|Ron Livingston,Drama|History|War,https://www.imdb.com/title/tt0185906/
4,4,tt7366338,Chernobyl,2019,9.3,5h 30m,3.3K,Jessie Buckley|Jared Harris|Stellan Skarsgård,Drama|History|Thriller,https://www.imdb.com/title/tt7366338/


### 1. Removing Unwanted Columns

In [104]:
# Keep only the columns the analysis below uses (Name, Director, Genre).
# Dropping by label with errors='ignore' -- rather than a series of in-place
# `del` statements -- makes this cell safe to re-run: executing it a second
# time is a no-op instead of a KeyError on an already-removed column.
df = df.drop(
    columns=['Unnamed: 0', 'ID', 'Rating', 'Duration', 'Reviews', 'Link', 'Year'],
    errors='ignore',
)

df.head()

Unnamed: 0,Name,Director,Genre
0,Planet Earth II,David Attenborough|Chadden Hunter|Elizabeth White,Documentary
1,Planet Earth,David Attenborough|Sigourney Weaver|Nikolay Dr...,Documentary
2,Breaking Bad,Bryan Cranston|Aaron Paul|Anna Gunn,Crime|Drama|Thriller
3,Band of Brothers,Scott Grimes|Damian Lewis|Ron Livingston,Drama|History|War
4,Chernobyl,Jessie Buckley|Jared Harris|Stellan Skarsgård,Drama|History|Thriller


### 2. Dealing with Null Values

In [105]:
# Remove rows without a usable genre.  The file marks missing genres with the
# literal text 'Nan'; real NaN values are excluded as well, because the cells
# below call `.split('|')` on every Genre entry and would fail on a float NaN.
df = df[df['Genre'].notna() & (df['Genre'] != 'Nan')]

df.head()

Unnamed: 0,Name,Director,Genre
0,Planet Earth II,David Attenborough|Chadden Hunter|Elizabeth White,Documentary
1,Planet Earth,David Attenborough|Sigourney Weaver|Nikolay Dr...,Documentary
2,Breaking Bad,Bryan Cranston|Aaron Paul|Anna Gunn,Crime|Drama|Thriller
3,Band of Brothers,Scott Grimes|Damian Lewis|Ron Livingston,Drama|History|War
4,Chernobyl,Jessie Buckley|Jared Harris|Stellan Skarsgård,Drama|History|Thriller


### 3. Applying Apriori on Genre
##### 3.1) Finding Unique Genre

In [106]:
# Collect every distinct genre label: each Genre cell is a '|'-joined string,
# so split each one and let a set discard the duplicates.
genre = list({label for entry in df['Genre'] for label in entry.split('|')})

##### 3.2) Finding the Occurrence Percentage

In [107]:
# Share of shows (in whole percent, truncated) that carry each unique genre.
data = []

# Tokenise every Genre cell once.  Membership is tested against the split
# tokens, not the raw string, so that e.g. 'Music' does not also count rows
# whose genre is 'Musical' (a plain substring test would).
genre_tokens = [set(entry.split('|')) for entry in df['Genre']]

for gen in genre:                                       # Going through each unique genre
    c = sum(gen in tokens for tokens in genre_tokens)   # Shows tagged with this genre
    data.append([gen, int((c/len(df))*100)])

data = pd.DataFrame(data, columns = ['Genre','Percentage'])
data.head()

Unnamed: 0,Genre,Percentage
0,Music,2
1,Fantasy,2
2,Sport,1
3,History,7
4,News,0


##### 3.3) Analysing the Occurrence Percentage

In [108]:
# Show the ten genres with the highest occurrence percentage (display only; `data` is not modified).
data.sort_values(by = 'Percentage', ascending = False).head(10)

Unnamed: 0,Genre,Percentage
21,Drama,54
10,Comedy,31
12,Action,24
17,Crime,23
15,Animation,21
18,Adventure,20
23,Mystery,11
19,Documentary,9
3,History,7
9,Thriller,6


##### 3.4) Apply Threshold of 20%

In [109]:
# Keep only the genres that appear in at least 20% of the shows.
data = data.loc[data['Percentage'].ge(20)]

##### 3.5) Finding the Combinations or Pairs

In [110]:
# Build every unordered pair of shortlisted genres (each pair once, no self-pairs).
shortlisted_gen = list(data['Genre'])

combinations = [
    [shortlisted_gen[first], shortlisted_gen[second]]
    for first in range(len(shortlisted_gen))
    for second in range(first + 1, len(shortlisted_gen))
]

combinations

[['Comedy', 'Action'],
 ['Comedy', 'Animation'],
 ['Comedy', 'Crime'],
 ['Comedy', 'Adventure'],
 ['Comedy', 'Drama'],
 ['Action', 'Animation'],
 ['Action', 'Crime'],
 ['Action', 'Adventure'],
 ['Action', 'Drama'],
 ['Animation', 'Crime'],
 ['Animation', 'Adventure'],
 ['Animation', 'Drama'],
 ['Crime', 'Adventure'],
 ['Crime', 'Drama'],
 ['Adventure', 'Drama']]

##### 3.5) Finding the Series with Specific Combination of Genre

In [111]:
# For every shortlisted genre pair, compute the share of shows (whole percent,
# truncated) whose Genre cell contains both genres as separate '|' tokens.
data_combinations = []

for combination in combinations:                        # Going through each combination
    wanted = set(combination)
    # Going through the whole dataset and counting rows that hold both genres
    c = sum(1 for entry in df['Genre'] if wanted.issubset(entry.split('|')))
    data_combinations.append([combination, int((c/len(df))*100)])

# Creating the DataFrame and sorting it, most frequent pair first
data_combinations = (
    pd.DataFrame(data_combinations, columns = ['Genre','Percentage'])
    .sort_values(by = 'Percentage', ascending = False)
)

##### 3.6) Apply Threshold of 10%

In [112]:
# Keep only the genre pairs that occur together in at least 10% of the shows.
data_combinations = data_combinations.loc[data_combinations['Percentage'].ge(10)]

data_combinations

Unnamed: 0,Genre,Percentage
13,"[Crime, Drama]",20
5,"[Action, Animation]",16
7,"[Action, Adventure]",16
10,"[Animation, Adventure]",14
4,"[Comedy, Drama]",11


##### 3.7) Printing Final Combinations

In [113]:
# Print each surviving genre pair on its own line, comma-separated.
for pair in data_combinations['Genre']:
    print(*pair, sep=',')

Crime,Drama
Action,Animation
Action,Adventure
Animation,Adventure
Comedy,Drama


### 4. Applying Apriori with Different Functions

In [187]:
def unique(df,column,sep):                                # Finding Unique Categories
    """Return the distinct tokens found in a delimited string column.

    Parameters
    ----------
    df : DataFrame containing ``column``.
    column : label of a column whose cells are strings joined by ``sep``.
    sep : delimiter handed to ``str.split`` for every cell.

    Returns
    -------
    list
        One entry per distinct token; the order is whatever the intermediate
        set yields and should not be relied upon.
    """
    return list({token for cell in df[column] for token in cell.split(sep)})


def finding_freq_single(df,column,categories,sep='|'):    # Finding the frequency of each category (Single)
    """Compute, for each category, the percentage of rows that contain it.

    A row counts for a category only when the category equals one of the
    ``sep``-separated tokens of the cell.  (A plain substring test would, for
    example, count 'Musical' rows as 'Music'.)

    Parameters
    ----------
    df : DataFrame containing ``column``.
    column : label of a column whose cells are strings joined by ``sep``.
    categories : iterable of category names to measure.
    sep : delimiter used inside the cells.  Defaults to '|'; callers whose
        data uses a different delimiter must pass it explicitly.

    Returns
    -------
    DataFrame
        Columns ``[column, 'Percentage']`` with one row per category, in the
        order given; ``Percentage`` is a float between 0 and 100.  For an
        empty ``df`` every percentage is 0.0 instead of a division error.
    """
    # Split every cell once up front instead of once per category.
    token_sets = [set(cell.split(sep)) for cell in df[column]]
    total = len(token_sets)

    data = []
    for category in categories:
        hits = sum(category in tokens for tokens in token_sets)
        data.append([category, (hits/total)*100 if total else 0.0])
    return pd.DataFrame(data, columns = [column,'Percentage'])


def apply_threshold(df, column, ascend, threshold):       # Applying the Threshold
    """Filter rows by a minimum value and return them sorted by that column.

    Parameters
    ----------
    df : DataFrame to filter; it is not modified.
    column : label of the column that is both compared and sorted on.
    ascend : passed to ``sort_values(ascending=...)``.
    threshold : rows are kept when ``df[column] >= threshold``.

    Returns
    -------
    DataFrame
        The surviving rows, with their original index labels, ordered by ``column``.
    """
    meets_threshold = df[column].ge(threshold)
    return df.loc[meets_threshold].sort_values(by = column, ascending = ascend)


def finding_combinations(df, column):                    # Finding the Combinations
    """List every unordered pair of values from one column.

    Parameters
    ----------
    df : DataFrame containing ``column``.
    column : label of the column whose values are paired up.

    Returns
    -------
    list of two-element lists
        ``[earlier, later]`` for every pair of distinct row positions, in row
        order; a position is never paired with itself.
    """
    shortlisted = list(df[column])
    size = len(shortlisted)
    return [
        [shortlisted[first], shortlisted[second]]
        for first in range(size)
        for second in range(first + 1, size)
    ]


def finding_freq_double(df, column, combinations, sep):  # Finding the frequency of each category (Doubles)
    """Compute, for each pair, the percentage of rows that contain both members.

    A row counts for a pair only when both members appear among the
    ``sep``-separated tokens of the cell.  Both membership tests use ``sep``;
    the first one was previously hard-coded to '|', which produced zero
    counts for data using any other delimiter.

    Parameters
    ----------
    df : DataFrame containing ``column``.
    column : label of a column whose cells are strings joined by ``sep``.
    combinations : iterable of two-element sequences ``[first, second]``.
    sep : delimiter used inside the cells.

    Returns
    -------
    DataFrame
        Columns ``[column, 'Percentage']`` where ``column`` holds the pair and
        ``Percentage`` is a float between 0 and 100, sorted by ``Percentage``
        in descending order.  For an empty ``df`` every percentage is 0.0
        instead of a division error.
    """
    # Split every cell once up front instead of twice per pair.
    token_sets = [set(cell.split(sep)) for cell in df[column]]
    total = len(token_sets)

    data_combinations = []
    for combination in combinations:
        first, second = combination[0], combination[1]
        c = sum(first in tokens and second in tokens for tokens in token_sets)
        data_combinations.append([combination, (c/total)*100 if total else 0.0])

    data_combinations = pd.DataFrame(data_combinations , columns = [column,'Percentage'])
    return data_combinations.sort_values(by = 'Percentage', ascending = False)

##### Calling the Function

In [179]:
# Run the pipeline step by step on the 'Director' column: unique names ->
# per-name percentage -> keep names at or above 0.5% -> all pairs -> pair percentages.
directors = unique(df, column='Director', sep='|')
directors_data = finding_freq_single(df, column='Director', categories=directors)
directors_data = apply_threshold(directors_data, column='Percentage', ascend=False, threshold=.5)
combinations = finding_combinations(directors_data, column='Director')
finding_freq_double(df, column='Director', combinations=combinations, sep='|').head(10)

Unnamed: 0,Director,Percentage
40,"[David Attenborough, Chadden Hunter]",0.884956
51,"[Jeremy Clarkson, James May]",0.884956
603,"[Masako Nozawa, Jôji Yanami]",0.884956
87,"[Jeremy Clarkson, Richard Hammond]",0.884956
302,"[James May, Richard Hammond]",0.884956
353,"[Edward James Olmos, Mary McDonnell]",0.884956
549,"[John DiMaggio, Katey Sagal]",0.442478
396,"[Brice Armstrong, Jôji Yanami]",0.442478
878,"[Roger Munns, John Hurt]",0.442478
955,"[Lee Tergesen, J.K. Simmons]",0.442478


### 5. Applying Apriori with Single Function

In [189]:
def apriori(df, column, sep, thres, asc, count):        # Applying Apriori Algorithm
    """Run the whole pair-frequency pipeline on one delimited column.

    Steps: collect the unique values, measure how often each occurs, keep
    those at or above ``thres``, pair up the survivors and measure how often
    each pair occurs together.

    Parameters
    ----------
    df : DataFrame containing ``column``.
    column : label of a column whose cells are strings joined by ``sep``.
    sep : delimiter forwarded to ``unique`` and ``finding_freq_double``.
    thres : minimum 'Percentage' a single value needs in order to be paired.
    asc : sort direction handed to ``apply_threshold``; it determines the
        order in which pairs are built.
    count : number of rows of the pair table to return.

    Returns
    -------
    DataFrame
        The first ``count`` rows of the table produced by ``finding_freq_double``.
    """
    frequent = apply_threshold(
        finding_freq_single(df, column, unique(df, column, sep)),   # unique values and their frequency
        'Percentage', asc, thres,                                   # threshold on single values
    )
    pairs = finding_combinations(frequent, column)                  # candidate pairs
    return finding_freq_double(df, column, pairs, sep).head(count)  # frequency of the pairs

# Top 10 pairs in the 'Director' column among names present in at least 0.5% of the shows.
apriori(df, column='Director', sep='|', thres=.5, asc=True, count=10)

Unnamed: 0,Director,Percentage
42,"[James May, Jeremy Clarkson]",0.884956
37,"[James May, Richard Hammond]",0.884956
1082,"[Chadden Hunter, David Attenborough]",0.884956
196,"[Edward James Olmos, Mary McDonnell]",0.884956
105,"[Masako Nozawa, Jôji Yanami]",0.884956
1087,"[Richard Hammond, Jeremy Clarkson]",0.884956
963,"[Timothy Olyphant, Nick Searcy]",0.442478
805,"[Derek Jacobi, John Hurt]",0.442478
672,"[Zach Aguilar, Natsuki Hanae]",0.442478
1094,"[Naveen Kasturia, Jitendra Kumar]",0.442478


### Process of Apriori

m1 = [g1,g2]

m2 = [g1,g3]

m3 = [g1,g2]

m4 = [g2,g3]

m5 = [g1,g2]

------------
Finding the Freq


g1 = 4/5 = 80%

g2 = 4/5 = 80%

g3 = 2/5 = 40%

-----
Minimum Support 30%

g1 = 4/5 = 80%

g2 = 4/5 = 80%

g3 = 2/5 = 40%

---
Finding Combinations

{g1,g2} = 3/5 = 60%

{g1,g3} = 1/5 = 20%

{g2,g3} = 1/5 = 20%

---
Minimum Support 30% (applied to the pairs)

{g1,g2}