In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
pd.set_option("display.max_columns",100)

In [4]:
df = pd.read_csv("https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7")
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings.Source,Ratings.Value,Metascore,imdbRating,imdbVotes,imdbID,Type,tomatoMeter,tomatoImage,tomatoRating,tomatoReviews,tomatoFresh,tomatoRotten,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,1,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...,English,USA,Nominated for 7 Oscars. Another 19 wins & 30 n...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.3/10,80.0,9.3,1825626,tt0111161,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,2,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...,"English, Italian, Latin",USA,Won 3 Oscars. Another 23 wins & 27 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.2/10,100.0,9.2,1243444,tt0068646,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True
2,3,The Godfather: Part II,1974,R,20 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...,"English, Italian, Spanish, Latin, Sicilian",USA,Won 6 Oscars. Another 10 wins & 20 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,85.0,9.0,856870,tt0071562,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True
3,4,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...,"English, Mandarin","USA, UK",Won 2 Oscars. Another 151 wins & 153 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,82.0,9.0,1802351,tt0468569,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True
4,5,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...,English,USA,Nominated for 3 Oscars. Another 16 wins & 8 no...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,8.9/10,96.0,8.9,494215,tt0050083,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True


In [5]:
df.shape

(250, 38)

In [6]:
df = df[["Title","Genre","Director","Actors","Plot"]]

In [7]:
df["Actors"] = df["Actors"].map(lambda x : x.split(",")[:3])

In [8]:
df["Genre"] = df["Genre"].map(lambda x : x.lower().split(","))

In [9]:
df["Genre"]

0                 [crime,  drama]
1                 [crime,  drama]
2                 [crime,  drama]
3        [action,  crime,  drama]
4                 [crime,  drama]
                  ...            
245           [drama,  film-noir]
246                       [drama]
247    [comedy,  drama,  romance]
248           [biography,  drama]
249                       [drama]
Name: Genre, Length: 250, dtype: object

In [10]:
df["Director"] = df["Director"].map(lambda x : x.split(" "))

In [11]:
for index, row in df.iterrows():
    row["Actors"] = [x.lower().replace(" ","") for x in row["Actors"]]
    row["Director"] = "".join(row["Director"]).lower()

In [12]:
df

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]",Two imprisoned men bond over a number of years...
1,The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]",The early life and career of Vito Corleone in ...
3,The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]",When the menace known as the Joker emerges fro...
4,12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]",A jury holdout attempts to prevent a miscarria...
...,...,...,...,...,...
245,The Lost Weekend,"[drama, film-noir]",billywilder,"[raymilland, janewyman, phillipterry]",The desperate life of a chronic alcoholic is f...
246,Short Term 12,[drama],destindanielcretton,"[brielarson, johngallagherjr., stephaniebeatriz]",A 20-something supervising staff member of a r...
247,His Girl Friday,"[comedy, drama, romance]",howardhawks,"[carygrant, rosalindrussell, ralphbellamy]",A newspaper editor uses every trick in the boo...
248,The Straight Story,"[biography, drama]",davidlynch,"[sissyspacek, janegallowayheitz, josepha.carpe...",An old man makes a long journey by lawn-mover ...


In [13]:
from rake_nltk import Rake

In [14]:
df["Key Words"] = ""

In [15]:

for index, row in df.iterrows():
    r = Rake()    
    r.extract_keywords_from_text (row["Plot"])
    key_word_dict_scores = r.get_word_degrees()
    row["Key Words"] = list (key_word_dict_scores.keys())

In [16]:
df.drop(columns = ["Plot"], inplace = True)

In [17]:
df.set_index("Title", inplace = True)
df.head()

Unnamed: 0_level_0,Genre,Director,Actors,Key Words
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]","[number, finding, solace, years, acts, eventua..."
The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]","[clandestine, empire, organized, crime, dynast..."
The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]","[career, vito, corleone, grip, expands, tighte..."
The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]","[people, mysterious, past, gotham, dark, knigh..."
12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]","[prevent, justice, forcing, miscarriage, evide..."


In [18]:
df["Bag of Words"] = ""
columns = df.columns
for index, row in df.iterrows():
    words = ""
    for col in columns:
        if col == "Director":
            words = words + " " + row[col]
        else:
            words = words + " " + " ".join(row[col])
    row["Bag of Words"] = words

In [19]:
df.drop(columns = [x for x in df.columns if x != "Bag of Words"], inplace = True)

In [20]:
count = CountVectorizer()
count_matrix = count.fit_transform(df["Bag of Words"])

In [21]:
indices = pd.Series(df.index)
indices[:5]

0    The Shawshank Redemption
1               The Godfather
2      The Godfather: Part II
3             The Dark Knight
4                12 Angry Men
Name: Title, dtype: object

In [22]:
count_matrix

<250x2961 sparse matrix of type '<class 'numpy.int64'>'
	with 5342 stored elements in Compressed Sparse Row format>

In [23]:
c = count_matrix.todense()

In [24]:
type (count_matrix)

scipy.sparse.csr.csr_matrix

In [25]:
print(count_matrix[0,:])

  (0, 584)	1
  (0, 768)	1
  (0, 1011)	1
  (0, 2678)	1
  (0, 1810)	1
  (0, 306)	1
  (0, 1899)	1
  (0, 969)	1
  (0, 2481)	1
  (0, 2950)	1
  (0, 59)	1
  (0, 888)	1
  (0, 2174)	1
  (0, 2765)	1
  (0, 1269)	1
  (0, 1733)	1
  (0, 311)	1
  (0, 519)	1
  (0, 655)	1


In [26]:
cos_sim = cosine_similarity(count_matrix, count_matrix)

In [27]:
cos_sim

array([[1.        , 0.15789474, 0.13764944, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.15789474, 1.        , 0.36706517, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.13764944, 0.36706517, 1.        , ..., 0.04588315, 0.04588315,
        0.04850713],
       ...,
       [0.05263158, 0.05263158, 0.04588315, ..., 1.        , 0.05263158,
        0.05564149],
       [0.05263158, 0.05263158, 0.04588315, ..., 0.05263158, 1.        ,
        0.05564149],
       [0.05564149, 0.05564149, 0.04850713, ..., 0.05564149, 0.05564149,
        1.        ]])

In [28]:
def recommendations(title, cos_sim = cos_sim):
    recommended = []
    idx = indices[indices == title].index[0]
    scores = pd.Series(cos_sim[idx]).sort_values(ascending = False)
    top_10 = list(scores[:11].index)
    print(top_10)
    
    for i in top_10:
        recommended.append(list(df.index)[i])
    return recommended

In [29]:
recommendations ("Fargo")

[128, 132, 34, 226, 1, 61, 2, 100, 15, 214, 125]


['Fargo',
 'No Country for Old Men',
 'The Departed',
 'Rope',
 'The Godfather',
 'Reservoir Dogs',
 'The Godfather: Part II',
 'On the Waterfront',
 'Goodfellas',
 'Arsenic and Old Lace',
 'The Big Lebowski']

In [30]:
from collections import defaultdict
from surprise import SVD
from surprise import Dataset

In [31]:
data = Dataset.load_builtin("ml-100k")

In [32]:
t_set = data.build_full_trainset()

In [33]:
t_set

<surprise.trainset.Trainset at 0x1bee4045630>

In [34]:
t_set.ur

defaultdict(list,
            {0: [(0, 3.0),
              (528, 4.0),
              (377, 4.0),
              (522, 3.0),
              (431, 5.0),
              (834, 5.0),
              (380, 4.0),
              (329, 4.0),
              (550, 5.0),
              (83, 4.0),
              (632, 2.0),
              (86, 4.0),
              (289, 5.0),
              (363, 3.0),
              (438, 5.0),
              (389, 5.0),
              (649, 4.0),
              (947, 4.0),
              (423, 3.0),
              (291, 3.0),
              (10, 2.0),
              (1006, 4.0),
              (179, 3.0),
              (751, 3.0),
              (487, 3.0),
              (665, 3.0),
              (92, 4.0),
              (512, 5.0),
              (1045, 3.0),
              (672, 4.0),
              (656, 4.0),
              (221, 5.0),
              (432, 2.0),
              (365, 3.0),
              (321, 2.0),
              (466, 4.0),
              (302, 4.0),
              (491, 3

In [35]:
algo = SVD()
algo.fit(t_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1bee6981ac8>

In [36]:
testset = t_set.build_anti_testset()

In [37]:
pred = algo.test(testset)

In [38]:
pred

[Prediction(uid='196', iid='302', r_ui=3.52986, est=4.224028720850314, details={'was_impossible': False}),
 Prediction(uid='196', iid='377', r_ui=3.52986, est=2.7537829690168945, details={'was_impossible': False}),
 Prediction(uid='196', iid='51', r_ui=3.52986, est=3.2207198339427228, details={'was_impossible': False}),
 Prediction(uid='196', iid='346', r_ui=3.52986, est=3.4637281523949306, details={'was_impossible': False}),
 Prediction(uid='196', iid='474', r_ui=3.52986, est=4.055890193255484, details={'was_impossible': False}),
 Prediction(uid='196', iid='265', r_ui=3.52986, est=3.650142438364244, details={'was_impossible': False}),
 Prediction(uid='196', iid='465', r_ui=3.52986, est=3.7908277710860836, details={'was_impossible': False}),
 Prediction(uid='196', iid='451', r_ui=3.52986, est=3.5574445136219723, details={'was_impossible': False}),
 Prediction(uid='196', iid='86', r_ui=3.52986, est=4.005786826499253, details={'was_impossible': False}),
 Prediction(uid='196', iid='1014',

In [39]:
def get_top_n(predictions, n):
    top_n =  defaultdict(list)
    for uid, iid, r_ui, est, _ in predictions:
        top_n[uid].append((iid,est))
    for uid, ratings in top_n.items():
        ratings.sort(key = lambda x: x[1], reverse = True)
        top_n[uid] = ratings[:n]
    return top_n

In [40]:
get_top_n(pred, 10)

defaultdict(list,
            {'196': [('408', 4.547392203402649),
              ('272', 4.538960459079122),
              ('483', 4.483904616408537),
              ('169', 4.444426230276542),
              ('50', 4.43943923113859),
              ('318', 4.407540443724309),
              ('174', 4.380407310850508),
              ('480', 4.37042866186631),
              ('136', 4.340048262456911),
              ('498', 4.319727414483357)],
             '186': [('483', 4.749440210266871),
              ('57', 4.569677680532657),
              ('132', 4.565519248403339),
              ('923', 4.484243989777425),
              ('143', 4.442324324325803),
              ('313', 4.400667197628478),
              ('168', 4.396656202492079),
              ('285', 4.389810781041121),
              ('657', 4.387053723515422),
              ('114', 4.384176574626994)],
             '22': [('12', 4.848845003728868),
              ('272', 4.777187069254289),
              ('56', 4.672194633031874),


In [41]:
from surprise import KNNWithMeans
#from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [42]:
data = Dataset.load_builtin("ml-100k")
train, test = train_test_split(data, test_size = .15)

<surprise.trainset.Trainset at 0x1bee419c978>

In [45]:
algo = KNNWithMeans(k = 50, sim_options = {"name": "pearson_baseline", "user_based":True})
algo.fit(train)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1bee4462470>

In [None]:
uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid, verbose = True)

In [None]:
pred_test = algo.test(test)

In [None]:
pred_test

In [None]:
algo = KNNWithMeans(k = 50, sim_options = {"name":"pearson_baseline", "user_based": False})

In [None]:
algo.fit(train)

In [None]:
algo.predict(uid, iid)

In [None]:
pred_test = algo.test(test)

In [None]:
pred_test

In [None]:
import os

In [None]:
mbo = pd.read_csv("Market_Basket_Optimisation.csv")

In [None]:
tns = []

In [None]:
mbo.shape

In [None]:
for i in range(0,7500):
    #tns.append([str(mbo.values[i,j]) for j in range(0,20)])
    for j in range(0,20):
        tns.append(str(mbo.values[i,j]))

In [None]:
tns

In [None]:
pip install apyori

In [None]:
from apyori import apriori
rules = apriori(tns, min_support = 0.003, min_confidence = 0.2, min_lift = 3, min_length = 3)
results = list(rules)

In [None]:
for item in results:
    pair = item[0]
    items = [x for x in pair]
    print ("Rule: " + items[0] + "->" + items[1])
    print ("Support: " + str(item[1]))
    print ("Confidence: " + str(item[2][0][2]))
    print ("Lift: " + str(item[2][0][3]))
    print ("-------------------------------------")

In [None]:
from scipy.linalg import svd

In [None]:
A = array([[1,2,3],[4,5,6],[7,8,9]])
print(A)

In [None]:
U, s, VT = svd(A)

In [None]:
print(U)

In [None]:
print (s)

In [None]:
print (VT)

In [None]:
sigma = np.diag(s)

In [None]:
print(sigma)

In [None]:
B = U.dot(sigma.dot(VT))
print(B)