# Scrape Data From TMDB

In [88]:
import time

In [94]:
import requests

import os

# --- Configuration ---------------------------------------------------------
TMDB_BASE_URL = "https://api.themoviedb.org/3"
LAST_PAGE = 452         # pages 1..LAST_PAGE of the top-rated list are fetched
RESULTS_PER_PAGE = 20   # at most this many entries are read from each page
REQUEST_TIMEOUT = 30    # seconds; stops a stalled connection from hanging the cell

headers = {
    "accept": "application/json",
}

# The API key is a credential, so it is read from the environment rather than
# being committed with the notebook. Set TMDB_API_KEY before starting Jupyter.
api_key = os.environ.get("TMDB_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the TMDB_API_KEY environment variable to your TMDB API key "
        "before running this cell."
    )
params = {"api_key": api_key}

t1 = time.time()

# --- Genre id -> name table ------------------------------------------------
genre_response = requests.get(
    TMDB_BASE_URL + "/genre/movie/list",
    params={**params, "language": "en"},
    headers=headers,
    timeout=REQUEST_TIMEOUT,
)
# Fail with the HTTP error (e.g. a rejected key) instead of a KeyError below.
genre_response.raise_for_status()
genre_data = genre_response.json()['genres']
# Dict lookup replaces a scan over genre_data for every genre id of every movie.
genre_names = {entry['id']: entry['name'] for entry in genre_data}

# --- Top-rated movies, page by page ----------------------------------------
movies = {'title': [], 'genre': [], 'overview': []}
for page in range(1, LAST_PAGE + 1):
    response = requests.get(
        TMDB_BASE_URL + "/movie/top_rated",
        headers=headers,
        params={**params, "language": "en-US", "page": page},
        timeout=REQUEST_TIMEOUT,
    )

    # Decode the body once per page. A page that cannot be decoded or has no
    # 'results' is skipped, but the skip is reported rather than swallowed.
    try:
        results = response.json()['results']
    except (ValueError, KeyError, TypeError) as exc:
        print(f"Skipping page {page}: HTTP {response.status_code} ({exc!r})")
        continue

    for entry in results[:RESULTS_PER_PAGE]:
        try:
            title = entry['title']
            overview = entry['overview']
            genre_id = entry['genre_ids']
        except KeyError as exc:
            # One incomplete entry should not discard the rest of the page.
            print(f"Skipping an entry on page {page}: missing field {exc}")
            continue

        # Genre names in the order of the movie's genre ids; ids that are not
        # in the genre table are dropped.
        genre = [genre_names[gid] for gid in genre_id if gid in genre_names]

        movies['title'].append(title)
        movies['genre'].append(genre)
        movies['overview'].append(overview)
t2 = time.time()

In [100]:
import pandas as pd
# Build the table from the scraped column lists: the dict keys
# ('title', 'genre', 'overview') become the DataFrame columns.
df = pd.DataFrame(data=movies)

In [102]:
df.head()

Unnamed: 0,title,genre,overview
0,The Godfather,"[Drama, Crime]","Spanning the years 1945 to 1955, a chronicle o..."
1,The Shawshank Redemption,"[Drama, Crime]",Framed in the 1940s for the double murder of h...
2,The Godfather Part II,"[Drama, Crime]",In the continuing saga of the Corleone crime f...
3,Schindler's List,"[Drama, History, War]",The true story of how businessman Oskar Schind...
4,Dilwale Dulhania Le Jayenge,"[Comedy, Drama, Romance]","Raj is a rich, carefree, happy-go-lucky second..."


# TEXT PREPROCESSING FOR NLP

                                    LOWER CASE

In [107]:
# Lower-case every overview before the later cleaning steps.
overview_lower = df['overview'].str.lower()
df['overview'] = overview_lower

In [110]:
df['overview'][53]

'ashitaka, a prince of the disappearing emishi people, is cursed by a demonized boar god and must journey to the west to find a cure. along the way, he encounters san, a young human woman fighting to protect the forest, and lady eboshi, who is trying to destroy it. ashitaka must find a way to bring balance to this conflict.'

                                    REMOVE PUNCTUATION

In [114]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [115]:
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` removed.

    Characters are deleted, not replaced by a space, so the letters on either
    side of a punctuation mark end up adjacent.
    """
    punctuation_chars = set(string.punctuation)
    kept_chars = [ch for ch in text if ch not in punctuation_chars]
    return ''.join(kept_chars)

remove_punctuation(df['overview'][53])

'ashitaka a prince of the disappearing emishi people is cursed by a demonized boar god and must journey to the west to find a cure along the way he encounters san a young human woman fighting to protect the forest and lady eboshi who is trying to destroy it ashitaka must find a way to bring balance to this conflict'

In [117]:
# Strip punctuation from every overview in the column.
overview_no_punct = df['overview'].apply(func=remove_punctuation)
df['overview'] = overview_no_punct

                                   REMOVE STOP WORDS

In [143]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
def remove_stop_words(text, words=None):
    """Drop stop words from a space-separated string.

    Parameters
    ----------
    text : str
        Text to filter. It is split on single spaces, so empty tokens produced
        by consecutive spaces are kept as they are.
    words : iterable of str, optional
        Stop words to remove. When omitted, the module-level ``stop_words``
        list is used.

    Returns
    -------
    str
        The tokens that are not stop words, joined with single spaces.
    """
    # Build the lookup set once per call instead of once per token.
    excluded = set(stop_words if words is None else words)
    return ' '.join(token for token in text.split(' ') if token not in excluded)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [144]:
remove_stop_words(df['overview'][53])

'ashitaka prince disappearing emishi people cursed demonized boar god must journey west find cure along way encounters san young human woman fighting protect forest lady eboshi trying destroy ashitaka must find way bring balance conflict'

In [146]:
# Filter the stop words out of every overview in the column.
overview_no_stop_words = df['overview'].apply(func=remove_stop_words)
df['overview'] = overview_no_stop_words

                                SPELLING CORRECTION

In [148]:
# Install into the running kernel's environment; the version is pinned to the
# release this notebook was run with so a re-run installs the same package.
%pip install textblob==0.17.1

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
     ------------------------------------- 636.8/636.8 kB 13.3 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.17.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [155]:
from textblob import TextBlob
def spell_corrector(text):
    """Return ``text`` as a plain string after TextBlob's ``correct()`` pass."""
    corrected_blob = TextBlob(text).correct()
    return str(corrected_blob)
spell_corrector('grammer with arshlaan')

'grammar with arshlaan'

In [156]:
# Do not re-run; it's not necessary, the spellings are already correct.
# df['overview'] = df['overview'].apply(spell_corrector)

                            TOKENIZE TEXT

In [157]:
import nltk
nltk.download('punkt')  # Download the Punkt tokenizer data

from nltk.tokenize import word_tokenize

def tokenize_text(text):
    """Return the result of NLTK's ``word_tokenize`` applied to ``text``."""
    return word_tokenize(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [161]:
tokenize_text(df['overview'][79])

['separated',
 'daughter',
 'father',
 'intellectual',
 'disability',
 'must',
 'prove',
 'innocence',
 'jailed',
 'death',
 'commanders',
 'child']

In [163]:
# Replace each overview string with the value returned by tokenize_text.
overview_tokens = df['overview'].apply(func=tokenize_text)
df['overview'] = overview_tokens

In [164]:
df

Unnamed: 0,title,genre,overview
0,The Godfather,"[Drama, Crime]","[spanning, years, 1945, 1955, chronicle, ficti..."
1,The Shawshank Redemption,"[Drama, Crime]","[framed, 1940s, double, murder, wife, lover, u..."
2,The Godfather Part II,"[Drama, Crime]","[continuing, saga, corleone, crime, family, yo..."
3,Schindler's List,"[Drama, History, War]","[true, story, businessman, oskar, schindler, s..."
4,Dilwale Dulhania Le Jayenge,"[Comedy, Drama, Romance]","[raj, rich, carefree, happygolucky, second, ge..."
...,...,...,...
9021,Disaster Movie,"[Comedy, Science Fiction]","[filmmaking, team, behind, hits, scary, movie,..."
9022,Battlefield Earth,"[Action, Adventure, Science Fiction]","[year, 3000, man, match, psychlos, greedy, man..."
9023,House of the Dead,"[Horror, Action, Thriller]","[set, island, coast, techno, rave, party, attr..."
9024,Dragonball Evolution,"[Action, Adventure, Fantasy, Science Fiction, ...","[18th, birthday, goku, receives, mystical, dra..."


In [166]:
# Save the processed table without the row index.
# NOTE(review): 'genre' and 'overview' hold Python objects rather than plain
# strings at this point, so check how they read back from the CSV.
output_path = 'movies_data.csv'
df.to_csv(output_path, index=False)