In [None]:
!pip install -U srt
!pip install nltk
!pip install pandas==1.1.0 as pd

Requirement already up-to-date: srt in /usr/local/lib/python3.6/dist-packages (3.4.1)


In [None]:
# Mount Google Drive at /content/drive; every data path used later in this
# notebook lives under that mount point (Colab-only step).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize
import nltk
import os
import srt
from srt import SRTParseError
import pickle
import pandas as pd
import numpy as np

# Fetch the NLTK resources this notebook relies on: the 'stopwords' corpus is
# read by remove_stopwords; 'punkt' is presumably the tokenizer model behind
# word_tokenize (confirm against the NLTK docs).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Root folder on the mounted Drive; it is listed two levels deep
# (sub-folder -> subtitle files) when the subtitles are loaded.
dir = '/content/drive/My Drive/Master/subtitles2'
# Subtitle text per movie, keyed by the id derived from the file name.
subs = dict()
# Corpus-wide token counts, filled once the subtitles have been cleaned.
full_vocab = dict()
# Regular expressions whose matches are replaced by a space, applied in order.
# Raw strings keep the backslashes literal and avoid Python's
# "invalid escape sequence" warning; the pattern text itself is unchanged.
# NOTE(review): the plain-word patterns have no word boundaries, so they also
# match inside longer words (e.g. 'here' inside 'there') - confirm intended.
patterns = [
    # URLs
    r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[.\!\/\w]*))?)',
    # Anything containing an '@' (e-mail addresses, handles)
    r'\S*@\S*\s?',
    # Every character that is not an ASCII letter
    r'[^a-zA-Z]',
    # Digit runs
    r'\d+',
    # Literal words to strip
    'Advertise', 'product', 'brand', 'here', 'contact',
    'today', 'english', 'sdh', 'nord', 'usd']


In [None]:
# Remove stop words from a tokenized string
def remove_stopwords(tokens):
  """Lazily drop every token that is in NLTK's English stop-word list.

  Returns a ``filter`` iterator over ``tokens``; it can be consumed once.
  """
  english_stops = set(stopwords.words('english'))

  def is_not_stopword(token):
    return token not in english_stops

  return filter(is_not_stopword, tokens)


# Apply stemming to a tokenized string
def stem(tokens):
  """Lazily stem each token with the English Snowball stemmer.

  Returns a ``map`` iterator over ``tokens``; it can be consumed once.
  """
  stem_word = SnowballStemmer('english').stem
  return map(stem_word, tokens)


def remove_patterns(text, patterns=patterns):
  """Replace every match of each regex in ``patterns`` with a single space.

  The patterns are applied one after another, case-insensitively, each one
  working on the output of the previous substitution.
  """
  cleaned = text
  for regex in patterns:
    cleaned = re.sub(regex, ' ', cleaned, flags=re.IGNORECASE)
  return cleaned


# Reduce letters that are repeated 3 times or more sequentially to 2
def reduce_repeated(text):
  """Collapse runs of three or more of the same character down to two.

  The match is case-insensitive and applies to any character except a
  newline; the two kept characters are copies of the first one in the run.
  """
  long_run = re.compile(r"(.)\1{2,}", flags=re.IGNORECASE)
  return long_run.sub(r"\1\1", text)


# Remove words with less than three characters
def remove_shortwords(tokens):
  """Lazily keep only tokens that are at least three characters long.

  Returns a ``filter`` iterator over ``tokens``; it can be consumed once.
  """
  def long_enough(word):
    return len(word) >= 3

  return filter(long_enough, tokens)


def clean_subs(subs_dict):
  """Clean and tokenize every subtitle text in ``subs_dict``.

  Each text goes through: pattern removal, collapsing of repeated
  characters, tokenization, stop-word removal, stemming, a second stop-word
  removal, and removal of tokens shorter than three characters.

  Args:
    subs_dict: mapping of movie key -> raw subtitle text (str).

  Returns:
    A new dict with the same keys, in the same order, mapping to lists of
    cleaned tokens. ``subs_dict`` itself is left untouched, so the function
    can safely be called again on the same input (the previous version
    overwrote the raw texts with single-use iterators).
  """
  cleaned_subs = dict()
  for movie, text in subs_dict.items():
    cleaned = remove_patterns(text, patterns)
    cleaned = reduce_repeated(cleaned)
    cleaned = word_tokenize(cleaned)
    cleaned = remove_stopwords(cleaned)
    cleaned = stem(cleaned)
    # Second pass - presumably because stemming can turn a token into a stop word.
    cleaned = remove_stopwords(cleaned)
    cleaned = remove_shortwords(cleaned)
    # Materialize the lazy filter/map chain so the tokens can be re-read.
    cleaned_subs[movie] = list(cleaned)
  return cleaned_subs

# Parse an srt file and return its text content as a string
def subfile_to_string(subfile):
  """Parse SRT input and concatenate the subtitle texts into one string.

  The first and the last subtitle entry are always discarded (presumably
  because they tend to hold ads/credits - confirm), and every remaining
  entry's content is appended with a leading space.

  Args:
    subfile: object accepted by ``srt.parse``; its ``name`` attribute is
      used in the message printed when parsing fails.

  Returns:
    The concatenated text, an empty string when there are at most two
    entries, or ``None`` when the input has no entries or cannot be parsed.
  """
  sub_generator = srt.parse(subfile)

  try:
    subtitles = list(sub_generator)
  except SRTParseError:
    print(f'Unable to parse subtitle file. {subfile.name} will not be included.')
    return None

  if len(subtitles) < 1:
    return None

  # Slicing drops the first and last entry and also copes with a one-entry
  # file, where popping twice used to raise IndexError.
  return ''.join(f' {subtitle.content}' for subtitle in subtitles[1:-1])

# Take a list of words and return a vocab
def tokens_to_vocab(tokens):
  """Count how often each token occurs.

  Returns a dict mapping token -> ``[count]``; each count is wrapped in a
  one-element list. Keys appear in order of first occurrence.
  """
  counts = {}
  for token in tokens:
    seen_so_far = counts.get(token, [0])[0]
    counts[token] = [seen_so_far + 1]
  return counts

# Take a vocab and add it to another vocab
def add_to_full_vocab(vocab, full_vocab):
  """Add the counts of ``vocab`` onto ``full_vocab`` and return it.

  ``vocab`` maps token -> ``[count]`` (one-element list); ``full_vocab`` maps
  token -> plain count. ``full_vocab`` is updated in place and the same
  object is returned.
  """
  for token, wrapped_count in vocab.items():
    if token in full_vocab:
      full_vocab[token] += wrapped_count[0]
    else:
      full_vocab[token] = wrapped_count[0]
  return full_vocab


In [None]:
# Create dict with subtitles
for subdir in os.listdir(dir):
  subdir_path = os.path.join(dir, subdir)
  for sub in os.listdir(subdir_path):
    # The with-block closes the file even if parsing raises unexpectedly.
    with open(os.path.join(subdir_path, sub), 'rt', encoding='iso-8859-1') as infile:
      subtitles = subfile_to_string(infile)
    if subtitles is not None:
      # Key = file name without its last four characters (presumably the
      # '.srt' extension) and without leading zeros.
      subs[sub[:-4].lstrip('0')] = subtitles

In [None]:
# Persist the raw subtitle texts; the with-block guarantees the file is
# flushed and closed once the dump is done.
with open('/content/drive/My Drive/Master/Data/subs.p', 'wb') as outfile:
  pickle.dump(subs, outfile)

In [None]:
# Reload the raw subtitle texts saved earlier.
# NOTE: pickle.load executes arbitrary code from the file - only load
# pickles this notebook produced itself.
with open('/content/drive/My Drive/Master/Data/subs.p', 'rb') as infile:
  subs = pickle.load(infile)

In [None]:
# Clean every subtitle text and turn its tokens into a per-movie vocabulary
# (token -> [count]).
subs_cleaned = {movie: tokens_to_vocab(tokens)
                for movie, tokens in clean_subs(subs).items()}

# Rebuild the corpus-wide vocabulary from scratch so that re-running this
# cell does not add the same counts a second time.
full_vocab = dict()
for vocab in subs_cleaned.values():
  full_vocab = add_to_full_vocab(vocab, full_vocab)


In [None]:
# Persist the per-movie vocabularies and the corpus-wide vocabulary; the
# with-blocks make sure both files are flushed and closed.
with open('/content/drive/My Drive/Master/Data/subs_cleaned.p', 'wb') as outfile:
  pickle.dump(subs_cleaned, outfile)
with open('/content/drive/My Drive/Master/Data/subs_vocab.p', 'wb') as outfile:
  pickle.dump(full_vocab, outfile)

In [None]:
# Reload the per-movie vocabularies and the corpus-wide vocabulary.
# NOTE: pickle.load executes arbitrary code from the file - only load
# pickles this notebook produced itself.
with open('/content/drive/My Drive/Master/Data/subs_cleaned.p', 'rb') as infile:
  subs_cleaned = pickle.load(infile)
with open('/content/drive/My Drive/Master/Data/subs_vocab.p', 'rb') as infile:
  full_vocab = pickle.load(infile)

In [None]:
# Load the pickled movie table and keep only its id columns.
# NOTE: pickle.load executes arbitrary code from the file - only load
# pickles from a trusted source.
with open('/content/drive/My Drive/Master/Data/subtitle_movies.p', 'rb') as infile:
  movies = pickle.load(infile)
movies = movies.drop(columns=['title', 'genres'])

# Ratings file: '::'-separated, only the first two fields are read. The
# multi-character separator is why the python parsing engine is requested.
movielens_df = pd.read_csv('/content/drive/My Drive/Master/Data/10Mratings.dat', sep="::", usecols = [0, 1], names = ['userId', 'movieId'], engine = 'python')

In [None]:
# Keep only the movies that occur in the ratings file. A single vectorised
# membership test replaces the row-by-row loop, which recomputed unique() and
# rebuilt the frame for every dropped row.
is_rated = movies['movieId'].isin(movielens_df['movieId'].unique())
# .copy() gives an independent frame so later column assignments are safe.
movies = movies.loc[is_rated].copy()


Unnamed: 0,movieId,imdbId
88,89,113972
90,92,117002
91,93,114825
92,94,115639
93,95,115759
...,...,...
13255,64926,36629
13258,64944,52794
13262,64969,1068680
13274,65025,43476


In [None]:
# Keep only the movies that have a cleaned subtitle vocabulary, and remember
# their ids. One vectorised membership test replaces the row-by-row in-place
# drop, which rebuilt the frame for every removed row.
has_subtitles = movies['imdbId'].isin(list(subs_cleaned.keys()))
# .copy() gives an independent frame so later column assignments are safe.
movies = movies.loc[has_subtitles].copy()
imdbIds = movies['imdbId'].tolist()


In [None]:
# Drop every subtitle vocabulary whose id is not among the kept movies.
# A set gives constant-time membership checks instead of scanning the list
# for each key, and the loop variable no longer shadows the builtin `id`.
kept_ids = set(imdbIds)
# Collect the keys first: a dict must not change size while it is iterated.
diff = [sub_id for sub_id in subs_cleaned if sub_id not in kept_ids]
for sub_id in diff:
  subs_cleaned.pop(sub_id)
print(len(subs_cleaned))

3595


In [None]:
# Stack the per-movie vocabularies into a single frame: one frame per movie,
# concatenated under its key, with one column per token.
vocab_frames = [pd.DataFrame(vocab) for vocab in subs_cleaned.values()]
stacked = pd.concat(vocab_frames, keys=subs_cleaned.keys()).reset_index()
# The outer index level (the dict key) becomes the trailing 'imdbId' column;
# both helper index columns are then discarded.
subs_df = stacked.assign(imdbId=stacked['level_0']).drop(columns=['level_0', 'level_1'])

In [None]:
# Persist the token-count frame; the with-block guarantees the file is
# flushed and closed once the dump is done.
with open('/content/drive/My Drive/Master/Data/10Msubs_df.p', 'wb') as outfile:
  pickle.dump(subs_df, outfile)

In [None]:
# Reload the token-count frame saved earlier.
# NOTE: pickle.load executes arbitrary code from the file - only load
# pickles this notebook produced itself.
with open('/content/drive/My Drive/Master/Data/10Msubs_df.p', 'rb') as infile:
  subs_df = pickle.load(infile)

In [None]:
# Display the reloaded frame as this cell's output (one column per token
# plus the trailing imdbId column).
subs_df

Unnamed: 0,tell,secret,verin,love,day,pierr,everyth,would,like,perfect,feel,cold,want,talk,mean,upset,care,much,good,cruel,wish,realli,sorri,stop,get,whi,said,come,let,dare,fault,explain,rough,littl,tramp,hurri,shut,madam,beg,help,...,sakuma,keiichi,mido,keiichiro,yoshinao,kawano,kiichiro,katayama,soichiro,amamiya,rumiko,mimura,sapporo,kurihara,nakagawa,yuichiro,asamiya,howa,yrs,uninfluenc,nosewheel,meti,uhmm,puccio,playdat,lookidi,fira,mirand,tristin,djoum,salvi,bfg,bottler,kurtzman,cyrin,bonafid,cuanto,tiempo,lennini,imdbId
0,26.0,3.0,11.0,27.0,17.0,21.0,9.0,20.0,59.0,2.0,10.0,7.0,43.0,7.0,4.0,1.0,8.0,15.0,26.0,2.0,8.0,9.0,8.0,4.0,26.0,11.0,3.0,43.0,28.0,2.0,5.0,6.0,2.0,10.0,3.0,6.0,8.0,17.0,5.0,8.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,61395
1,18.0,1.0,,3.0,8.0,,1.0,7.0,25.0,1.0,1.0,1.0,3.0,8.0,7.0,,3.0,4.0,22.0,,16.0,,3.0,4.0,30.0,25.0,3.0,53.0,27.0,,1.0,,2.0,46.0,,15.0,2.0,,1.0,12.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,32910
2,8.0,13.0,,21.0,11.0,,6.0,3.0,24.0,1.0,1.0,,10.0,5.0,12.0,,,6.0,4.0,,3.0,4.0,2.0,1.0,11.0,13.0,9.0,14.0,3.0,,,1.0,,2.0,,2.0,,6.0,,2.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,58898
3,8.0,1.0,,13.0,12.0,,2.0,18.0,38.0,3.0,10.0,3.0,22.0,7.0,7.0,1.0,4.0,10.0,7.0,,2.0,7.0,1.0,5.0,23.0,8.0,9.0,26.0,9.0,,1.0,1.0,,10.0,,1.0,1.0,,2.0,2.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,61495
4,4.0,2.0,,2.0,8.0,,4.0,5.0,15.0,1.0,5.0,3.0,10.0,3.0,,,,7.0,7.0,,1.0,4.0,,6.0,17.0,5.0,6.0,42.0,10.0,1.0,,,,3.0,,3.0,,,1.0,2.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,51980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,10.0,8.0,,12.0,19.0,,2.0,28.0,13.0,1.0,9.0,2.0,14.0,5.0,11.0,1.0,7.0,14.0,20.0,,7.0,3.0,6.0,4.0,33.0,8.0,4.0,47.0,20.0,1.0,1.0,,,16.0,,6.0,1.0,,1.0,11.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,120737
1504,10.0,2.0,,8.0,15.0,,9.0,43.0,77.0,1.0,23.0,1.0,30.0,11.0,19.0,,4.0,20.0,21.0,,2.0,35.0,,8.0,43.0,9.0,33.0,35.0,23.0,,,,1.0,15.0,,,,,,9.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,428441
1505,10.0,,,2.0,1.0,,10.0,10.0,11.0,,6.0,,15.0,5.0,7.0,,4.0,5.0,5.0,,4.0,2.0,10.0,12.0,29.0,7.0,5.0,30.0,22.0,,,8.0,,4.0,,1.0,5.0,6.0,,15.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,970416
1506,14.0,,,,3.0,,1.0,3.0,9.0,,,1.0,14.0,7.0,5.0,,9.0,6.0,4.0,,1.0,6.0,8.0,6.0,12.0,11.0,8.0,24.0,11.0,,1.0,,,,,7.0,3.0,,,2.0,...,1.0,1.0,28.0,5.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,,,,,,,,,,,,,,,,,,1478965


In [None]:
# Cast the join key to int in both frames so the merge compares like with like.
for frame in (subs_df, movies):
  frame['imdbId'] = frame['imdbId'].astype(int)

# Attach the movie columns to each row of token counts, then discard the
# join key itself.
merged = subs_df.merge(movies, on='imdbId')
subs_df = merged.drop(columns='imdbId')

