In [5]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from nrclex import NRCLex
import time
import math

In [2]:
profile = pd.read_csv("profiles.csv")
anime = pd.read_csv("anime.csv")
animes = pd.read_csv("animes.csv")
animelist = pd.read_csv("animelist.csv")
rating = pd.read_csv("rating_complete.csv")
watchingStatus = pd.read_csv("watching_status.csv")
reviews = pd.read_csv("reviews.csv")
animeSynopsis = pd.read_csv("anime_with_synopsis.csv")

In [66]:
profile.shape, anime.shape, animes.shape, rating.shape,reviews.shape,animeSynopsis.shape

((81727, 4), (17562, 35), (19311, 12), (57633278, 3), (192112, 7), (16214, 5))

# Process the profile data
In this section we do two things to the profiles.csv, the first is to drop the link column which will not be used in this project, and the second is to add the "num_favorite" column which records the number of favorite anime of each user.

In [4]:
profile.head()

Unnamed: 0,profile,gender,birthday,favorites_anime,link
0,DesolatePsyche,Male,"Oct 2, 1994","['33352', '25013', '5530', '33674', '1482', '2...",https://myanimelist.net/profile/DesolatePsyche
1,baekbeans,Female,"Nov 10, 2000","['11061', '31964', '853', '20583', '918', '925...",https://myanimelist.net/profile/baekbeans
2,skrn,,,"['918', '2904', '11741', '17074', '23273', '32...",https://myanimelist.net/profile/skrn
3,edgewalker00,Male,Sep 5,"['5680', '849', '2904', '3588', '37349']",https://myanimelist.net/profile/edgewalker00
4,aManOfCulture99,Male,"Oct 30, 1999","['4181', '7791', '9617', '5680', '2167', '4382...",https://myanimelist.net/profile/aManOfCulture99


In [40]:
profile = profile.drop("link",1)

In [72]:
length = [len(fav) for fav in profile["favorites_anime"]]
profile["num_favorite"] = length

In [22]:
age = [2022-int(re.findall("[0-9]{4}", text)[0]) if (not pd.isna(text) and len(re.findall("[0-9]{4}", text))!=0)  else None for text in profile.birthday]
profile["age"] = age

In [25]:
profile.to_csv("profiles.csv")

# Handle missing values in the rating.csv file 
As the website indicated, all entries with rating = 0 are actually missing values, so we have to impute them as None. Also, weird values that are not in the indicated range would also be treated as missing.

In [50]:
ratings = animelist["rating"]
cleaned_rating = [rating if rating != 0 else None for rating in ratings]
animelist["rating"] = cleaned_rating

In [59]:
cleaned_watching = [status if status in [1,2,3,4,6] else None for status in animelist["watching_status"]]
animelist["watching_status"] = cleaned_watching

In [61]:
animelist.head()

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,0,67,9.0,1.0,1
1,0,6702,7.0,1.0,4
2,0,242,10.0,1.0,4
3,0,4898,,1.0,1
4,0,21,10.0,1.0,0


In [68]:
animelist = animelist.iloc[:21846732,:]

In [69]:
animelist.to_csv("animeRating.csv")

# For the dataset that stores information of various animes, we also want to delete unimportant columns

In [15]:
anime.columns

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1'],
      dtype='object')

In [84]:
name = "Score-"
for i in range(1,11):
    colname = name + str(i)
    anime = anime.drop(colname,1)

In [86]:
anime.to_csv("anime.csv")

# Finally we need to process the review.csv data
Extract the sentiment scores of each review, and also flatten the score dictionary.

In [88]:
reviews = reviews.iloc[:84389,:]

In [92]:
reviews = reviews.drop("link",1)

In [123]:
reviews.text = reviews.text.str.lower()

In [116]:
overall = [int(re.findall("[0-9]+", score)[0]) for score in reviews.scores]
story = [int(re.findall("[0-9]+", score)[1]) for score in reviews.scores]
animation = [int(re.findall("[0-9]+", score)[2]) for score in reviews.scores]
sound = [int(re.findall("[0-9]+", score)[3]) for score in reviews.scores]
character = [int(re.findall("[0-9]+", score)[4]) for score in reviews.scores]
enjoyment = [int(re.findall("[0-9]+", score)[5]) for score in reviews.scores]

In [132]:
reviews["overall"] = overall
reviews["story"] = story
reviews["animation"] = animation 
reviews["sound"] = sound
reviews["character"] = character
reviews["enjoyment"] = enjoyment

In [127]:
start = time.time()
senti_list = []
for i in range(len(reviews.text)):
    senti_list.append(NRCLex(reviews.text[i]))
end = time.time()
print("The encoding process took ", end-start, " s.")

The encoding process took  655.4879891872406  s.


In [128]:
senti_detail = [senti_list[i].affect_frequencies for i in range(len(senti_list))]
fear = [senti_detail[i]["fear"] for i in range(len(senti_detail))]
anger = [senti_detail[i]["anger"] for i in range(len(senti_detail))]
anticip = [senti_detail[i]["anticip"] for i in range(len(senti_detail))]
trust = [senti_detail[i]["trust"] for i in range(len(senti_detail))]
surprise = [senti_detail[i]["surprise"] for i in range(len(senti_detail))]
positive = [senti_detail[i]["positive"] for i in range(len(senti_detail))]
negative = [senti_detail[i]["negative"] for i in range(len(senti_detail))]
sadness = [senti_detail[i]["sadness"] for i in range(len(senti_detail))]
disgust = [senti_detail[i]["disgust"] for i in range(len(senti_detail))]
joy = [senti_detail[i]["joy"] for i in range(len(senti_detail))]
reviews["fear"] = fear
reviews["anger"] = anger
reviews["anticip"] = anticip
reviews["trust"] = trust
reviews["surprise"] = surprise
reviews["positive"] = positive
reviews["negative"] = negative
reviews["sadness"] = sadness
reviews["disgust"] = disgust
reviews["joy"] = joy

In [133]:
reviews.head()

Unnamed: 0,uid,profile,anime_uid,text,score,scores,fear,anger,anticip,trust,...,negative,sadness,disgust,joy,overall,story,animation,sound,character,enjoyment
0,255938,DesolatePsyche,34096,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '8', 'Animation': '8...",0.047619,0.054422,0.0,0.129252,...,0.136054,0.081633,0.054422,0.122449,8,8,8,10,9,8
1,259117,baekbeans,34599,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",0.088235,0.029412,0.0,0.134454,...,0.105042,0.058824,0.021008,0.117647,10,10,10,10,10,10
2,253664,skrn,28891,\n \n \n \n ...,7,"{'Overall': '7', 'Story': '7', 'Animation': '9...",0.028169,0.014085,0.0,0.225352,...,0.014085,0.042254,0.0,0.183099,7,7,9,8,8,8
3,8254,edgewalker00,2904,\n \n \n \n ...,9,"{'Overall': '9', 'Story': '9', 'Animation': '9...",0.056769,0.052402,0.0,0.122271,...,0.135371,0.074236,0.078603,0.09607,9,9,9,10,10,9
4,291149,aManOfCulture99,4181,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",0.072727,0.036364,0.0,0.181818,...,0.054545,0.054545,0.018182,0.181818,10,10,8,9,10,10


In [135]:
reviews = reviews.drop("scores", 1)
reviews = reviews.drop("uid", 1)
reviews = reviews.drop("anime_uid", 1)

In [136]:
reviews.to_csv("reviews.csv")