This notebook does the following:
    - Generates a table of tags with the number of times each tag has been searched
    - Demonstrates recommending tags based on a user's tag search history

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Pull tag-search page views from the log_action table and flatten them into
# four parallel lists (userId, loginId, tagSearch, tstamp) with one entry per
# searched tag.
import getpass
import json
import os

import pymysql.cursors

userId = []
loginId = []
tagSearch = []
tstamp = []

# Connect to the database.
# The password is read from the ML3_DB_PASSWORD environment variable (with an
# interactive prompt as fallback) so it is never stored in the notebook.
# Host, user and database name can be overridden through ML3_DB_HOST,
# ML3_DB_USER and ML3_DB_NAME; their defaults match the previous settings.
connection = pymysql.connect(host=os.environ.get("ML3_DB_HOST", "127.0.0.1"),
                             user=os.environ.get("ML3_DB_USER", "web"),
                             password=(os.environ.get("ML3_DB_PASSWORD")
                                       or getpass.getpass("ML3_mirror DB password: ")),
                             db=os.environ.get("ML3_DB_NAME", "ML3_mirror"))

try:
    with connection.cursor() as cursor:
        # Select "pageview" rows from log_action whose JSON payload mentions
        # both the explore page (base.explore) and the text "tag".
        # The LIKE filter is only a coarse text match, so each row is checked
        # again below for an actual 'tag' parameter.

        sql = """SELECT * FROM log_action
        WHERE action = "pageview" and logJson LIKE ("%base.explore%") and logJson LIKE ("%tag%")
        """

        cursor.execute(sql)

        for row in cursor:
            # Columns are addressed by position because of SELECT *:
            # row[4] holds the JSON payload; row[1], row[2] and row[5] are
            # stored as userId, loginId and tstamp respectively.
            logjson = json.loads(row[4])
            # A row can match the LIKE filter without carrying a "params"
            # object; skip it instead of raising KeyError.
            params = logjson.get("params")

            if isinstance(params, dict) and 'tag' in params:
                # A search may contain several comma-separated tags; emit one
                # entry per tag.
                tagList = params['tag'].split(',')
                for tag in tagList:
                    userId.append(row[1])
                    loginId.append(row[2])
                    tagSearch.append(tag)
                    tstamp.append(row[5])

finally:
    connection.close()


KeyboardInterrupt: 

In [3]:
# Assemble the four parallel lists into a single frame, one row per searched tag.
tag_search2 = pd.DataFrame({
    "userId": userId,
    "loginId": loginId,
    "tagSearch": tagSearch,
    "tstamp": tstamp,
})

#### Save the dataframe to csv file

#### This file stores each search's full list of tags as a single entry


In [15]:
# NOTE(review): `tag_search` is not assigned by any cell above this one in the
# notebook as shown (the only visible assignment is further down, from
# tag_search_history_2.csv), so on a fresh kernel this line presumably raises
# NameError. Confirm which frame was meant to be saved here.
tag_search.to_csv("Datasets/tag_search_history.csv")

#### This file splits each searched tag list into one entry per tag

In [1]:
# Write the per-tag search history. The row index is written as well (the
# to_csv default, spelled out here); it comes back as the "Unnamed: 0" column
# when the file is read again.
tag_search2.to_csv("Datasets/tag_search_history_2.csv", index=True)

NameError: name 'tag_search2' is not defined

In [25]:
# Per-user, per-tag row counts, ordered by the loginId count (ascending).
(
    tag_search2
    .groupby(["userId", "tagSearch"])
    .count()
    .sort_values(by="loginId")
    .reset_index()
)

Unnamed: 0,userId,tagSearch,loginId,tstamp
0,254463,dramatic,1,1
1,255479,bittersweet,1,1
2,262827,good versus evil,1,1
3,262827,mythology,1,1
4,262827,special effects,1,1
5,262827,touching,1,1
6,262829,ai,1,1
7,262829,artificial intelligence,1,1
8,262829,blood,1,1
9,262829,chick flick,1,1


### Recommending tags based on Tag Search History

#### A demo of recommending tags based on Tag Search History for user 134088

In [4]:
# Load the per-tag search history and expose the searched tag under the
# column name "tag".
tag_search = (
    pd.read_csv("Datasets/tag_search_history_2.csv")
    .rename(columns={"tagSearch": "tag"})
)

In [10]:
# Keep only the tag search history rows that belong to user 134088
tag_search_134088 = tag_search.loc[tag_search["userId"] == 134088]

In [11]:
# Count how many times user 134088 searched each tag.
# The result has the columns userId, tag and num_tag_search.
tag_search_134088 = tag_search[tag_search.userId == 134088]

# GroupBy.size() counts the rows of each (userId, tag) group directly, so the
# count does not rely on the "Unnamed: 0" index column that read_csv produces,
# nor on loginId/tstamp being present to drop afterwards.
tag_search_134088_agg = (
    tag_search_134088
    .groupby(["userId", "tag"])
    .size()
    .reset_index(name="num_tag_search")
)

#### Merge Tag-search table with Tag-Score table

In [12]:
# This block fetches every row of the tag_movie table into `result`;
# the rows are used for weighting user tags.

import getpass
import json
import os

import pymysql.cursors

# Connect to the database.
# The password is read from the ML3_DB_PASSWORD environment variable (with an
# interactive prompt as fallback) so it is never stored in the notebook.
# Host, user and database name can be overridden through ML3_DB_HOST,
# ML3_DB_USER and ML3_DB_NAME; their defaults match the previous settings.
connection = pymysql.connect(host=os.environ.get("ML3_DB_HOST", "127.0.0.1"),
                             user=os.environ.get("ML3_DB_USER", "web"),
                             password=(os.environ.get("ML3_DB_PASSWORD")
                                       or getpass.getpass("ML3_mirror DB password: ")),
                             db=os.environ.get("ML3_DB_NAME", "ML3_mirror"))
try:
    with connection.cursor() as cursor:
        sql = "SELECT * FROM tag_movie"
        cursor.execute(sql)
        # Materialize all rows before the connection is closed.
        result = cursor.fetchall()

finally:
    connection.close()


In [13]:
# Wrap the fetched tag_movie rows in a DataFrame; column names are assigned
# by position.
movie_tag_app = pd.DataFrame(
    list(result),
    columns=[
        'movieId', 'tag', 'numApps', 'numPositive',
        'numNeutral', 'numNegative', 'numDownvotes', 'score',
    ],
)
movie_tag_app


Unnamed: 0,movieId,tag,numApps,numPositive,numNeutral,numNegative,numDownvotes,score
0,1,2009 reissue in Stereoscopic 3-D,1,1,0,0,2,-1.0
1,1,3D,3,0,3,0,3,0.0
2,1,55 movies every kid should see--Entertainment ...,1,0,1,0,2,-1.0
3,1,action,1,0,1,0,1,0.0
4,1,action figure,1,0,1,0,2,-1.0
5,1,action figures,1,0,1,0,1,0.0
6,1,adventure,17,14,3,0,4,13.0
7,1,almost favorite,1,0,1,0,5,-4.0
8,1,American Animation,1,0,1,0,1,0.0
9,1,animated,17,9,5,3,5,12.0


#### Sum the scores of movie-tag across all movies

In [14]:
# Total every numeric column per tag across all movies, highest summed score
# first. (movieId is summed along with the rest; that total carries no meaning.)
movie_tag_score_sum = (
    movie_tag_app
    .groupby(["tag"])
    .sum()
    .reset_index()
    .sort_values(by="score", ascending=False)
)
movie_tag_score_sum.head(10)

Unnamed: 0,tag,movieId,numApps,numPositive,numNeutral,numNegative,numDownvotes,score
61428,sci-fi,29177589,8998,6811,2064,123,1284,7714.0
29521,atmospheric,22176986,6034,5524,483,27,763,5271.0
65957,surreal,29702438,5047,4256,669,122,554,4493.0
27521,action,62353012,5917,3687,1996,234,1515,4402.0
68716,twist ending,14480865,4614,4031,476,107,589,4025.0
70135,visually appealing,20031591,4102,3854,232,16,307,3795.0
39157,dystopia,26252247,4124,3195,824,105,367,3757.0
35356,comedy,103626267,5557,3654,1813,90,1902,3655.0
71389,woman director,404272929,3621,5,3616,0,0,3621.0
37108,dark comedy,31890149,3847,3267,532,48,421,3426.0


#### Match User's Tag-Search-History with Tag Score Sums

In [15]:
# Attach the summed tag scores to each tag the user searched, then discard
# rows with missing values (e.g. searched tags that have no entry in the
# score table).
user_134088_tag_activity = (
    tag_search_134088_agg
    .merge(movie_tag_score_sum, on=['tag'], how='left')
    .dropna()
)

In [21]:
# Sum the user's tag activity per tag, order by score (highest first), and
# keep only tags whose score reaches the threshold of 10.
user_134088_tag_activity_agg = (
    user_134088_tag_activity
    .groupby(["tag"])
    .sum()
    .reset_index()
    .sort_values(by="score", ascending=False)
    .loc[lambda scored: scored["score"] >= 10]
)
user_134088_tag_activity_agg.head(10)


Unnamed: 0,tag,userId,num_tag_search,movieId,numApps,numPositive,numNeutral,numNegative,numDownvotes,score
46,surreal,134088,1,29702438.0,5047.0,4256.0,669.0,122.0,554.0,4493.0
1,action,134088,1,62353012.0,5917.0,3687.0,1996.0,234.0,1515.0,4402.0
36,psychology,134088,1,12818669.0,3556.0,3123.0,394.0,39.0,399.0,3157.0
42,space,134088,1,20550858.0,3044.0,2028.0,976.0,40.0,316.0,2728.0
41,social commentary,134088,2,23265238.0,2934.0,2503.0,346.0,85.0,355.0,2579.0
45,superhero,134088,1,25827465.0,2797.0,1574.0,968.0,255.0,315.0,2482.0
6,black comedy,134088,1,10241959.0,2103.0,1819.0,231.0,53.0,300.0,1803.0
25,mindfuck,134088,9,5545415.0,1639.0,1439.0,147.0,53.0,185.0,1454.0
34,politics,134088,1,48412050.0,1749.0,746.0,938.0,65.0,428.0,1321.0
40,sex,134088,1,75720383.0,1182.0,218.0,930.0,34.0,111.0,1071.0


In [18]:
#user_134088_tag_activity_agg.sort_values(by=["num_tag_search","score"],ascending=False).head(10)

#### Filter out tags that are not in the candidate set

In [19]:
# Load the trimmed tag genome table; its 'tag' column is the candidate set
# that recommended tags must belong to.
tag_genome_trimmed = pd.read_csv("Datasets/Tag_Genome_with_Score_Trimmed.csv")
tag_candidates = tag_genome_trimmed.loc[:, 'tag'].tolist()

In [20]:
# Restrict to candidate tags, then rank by how often the user searched the
# tag (num_tag_search) and break ties by score, both descending.
(
    user_134088_tag_activity_agg
    .loc[lambda activity: activity['tag'].isin(tag_candidates)]
    .sort_values(by=["num_tag_search", "score"], ascending=False)
    .head(10)
)

Unnamed: 0,tag,userId,num_tag_search,movieId,numApps,numPositive,numNeutral,numNegative,numDownvotes,score
25,mindfuck,134088,9,5545415.0,1639.0,1439.0,147.0,53.0,185.0,1454.0
11,disney,134088,4,6244146.0,195.0,75.0,111.0,9.0,25.0,170.0
49,toys,134088,3,2088965.0,86.0,25.0,56.0,5.0,19.0,67.0
41,social commentary,134088,2,23265238.0,2934.0,2503.0,346.0,85.0,355.0,2579.0
23,loneliness,134088,2,12377546.0,949.0,586.0,342.0,21.0,108.0,841.0
17,gangster,134088,2,32321490.0,543.0,95.0,441.0,7.0,49.0,494.0
7,car chase,134088,2,9762464.0,541.0,244.0,248.0,49.0,120.0,421.0
46,surreal,134088,1,29702438.0,5047.0,4256.0,669.0,122.0,554.0,4493.0
1,action,134088,1,62353012.0,5917.0,3687.0,1996.0,234.0,1515.0,4402.0
36,psychology,134088,1,12818669.0,3556.0,3123.0,394.0,39.0,399.0,3157.0
