# NLP Pipeline Step By Step
### 5000 Movies Dataset Preprocessing

In [58]:
import pandas as pd
import numpy as np

In [59]:
df=pd.read_csv("/content/tmdb_5000_movies.csv")

In [60]:
df.sample()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
778,60000000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 878, ""na...",http://www.meetdavemovie.com/,11260,"[{""id"": 242, ""name"": ""new york""}, {""id"": 1316,...",en,Meet Dave,A crew of miniature aliens operate a spaceship...,18.676291,"[{""name"": ""Twentieth Century Fox Film Corporat...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2008-07-08,50650079,90.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,There's a Whole Other World Going on Inside of...,Meet Dave,5.0,371


In [61]:
df=df[['title','overview']]
df.head()

Unnamed: 0,title,overview
0,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,Spectre,A cryptic message from Bond’s past sends him o...
3,The Dark Knight Rises,Following the death of District Attorney Harve...
4,John Carter,"John Carter is a war-weary, former military ca..."


In [62]:
df.shape

(4803, 2)

In [63]:
df.drop(df.index[100:], inplace=True)

In [64]:
df.shape

(100, 2)

In [65]:
df.overview[3]

"Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protect the late attorney's reputation and is subsequently hunted by the Gotham City Police Department. Eight years later, Batman encounters the mysterious Selina Kyle and the villainous Bane, a new terrorist leader who overwhelms Gotham's finest. The Dark Knight resurfaces to protect a city that has branded him an enemy."

In [66]:
df.isnull().sum()

title       0
overview    0
dtype: int64

In [67]:
df.duplicated().sum()

0

## Removing null Values

In [68]:
df.dropna(inplace=True)

In [69]:
df.isnull().sum()

title       0
overview    0
dtype: int64

## Preprocessing On Overview Column

## Removing Chat Words

In [70]:
chatwords = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "LOL": "Laughing out loud",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "IDC": "I don’t care",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "LMAO": "Laughing my a** off",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing"
}

print(chatwords)


{'AFAIK': 'As Far As I Know', 'AFK': 'Away From Keyboard', 'ASAP': 'As Soon As Possible', 'ATK': 'At The Keyboard', 'ATM': 'At The Moment', 'A3': 'Anytime, Anywhere, Anyplace', 'BAK': 'Back At Keyboard', 'BBL': 'Be Back Later', 'BBS': 'Be Back Soon', 'BFN': 'Bye For Now', 'B4N': 'Bye For Now', 'BRB': 'Be Right Back', 'BRT': 'Be Right There', 'BTW': 'By The Way', 'B4': 'Before', 'CU': 'See You', 'CUL8R': 'See You Later', 'CYA': 'See You', 'FAQ': 'Frequently Asked Questions', 'FC': 'Fingers Crossed', 'FWIW': "For What It's Worth", 'FYI': 'For Your Information', 'GAL': 'Get A Life', 'GG': 'Good Game', 'GN': 'Good Night', 'GMTA': 'Great Minds Think Alike', 'GR8': 'Great!', 'G9': 'Genius', 'IC': 'I See', 'ICQ': 'I Seek you (also a chat program)', 'ILU': 'ILU: I Love You', 'IMHO': 'In My Honest/Humble Opinion', 'IMO': 'In My Opinion', 'IOW': 'In Other Words', 'IRL': 'In Real Life', 'KISS': 'Keep It Simple, Stupid', 'LDR': 'Long Distance Relationship', 'LMAO': 'Laughing my a** off', 'LOL': 'L

In [71]:
def chat_conversation(text):
    new_text=[]
    for word in text.split():
        if word.upper() in chatwords:
            new_text.append(chatwords[word.upper()])
        else:
            new_text.append(word)
    return " ".join(new_text)

In [72]:
chat_conversation('IMHO he is the best')

'In My Honest/Humble Opinion he is the best'

In [73]:
df['overview']=df['overview'].apply(chat_conversation)

In [74]:
df['overview'][3]

"Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protect the late attorney's reputation and is subsequently hunted by the Gotham City Police Department. Eight years later, Batman encounters the mysterious Selina Kyle and the villainous Bane, a new terrorist leader who overwhelms Gotham's finest. The Dark Knight resurfaces to protect a city that has branded him an enemy."

## Removing Stopwords

In [75]:
!pip install nltk
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [76]:
from nltk.corpus import stopwords
stopwords=stopwords.words('english')
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [77]:
lst=[]
def remove_stopwords(text):
    filtered=[word for word in text.split() if word not in stopwords]
    return " ".join(filtered)

In [78]:
df['overview']=df['overview'].apply(remove_stopwords)

In [79]:
df.overview[3]

"Following death District Attorney Harvey Dent, Batman assumes responsibility Dent's crimes protect late attorney's reputation subsequently hunted Gotham City Police Department. Eight years later, Batman encounters mysterious Selina Kyle villainous Bane, new terrorist leader overwhelms Gotham's finest. The Dark Knight resurfaces protect city branded enemy."

## Spelling Correction

In [80]:
from textblob import TextBlob


In [81]:
def spelling_correction(text):
    blb=TextBlob(text)
    return blb.correct().string

In [82]:
spelling_correction("He is really certane amount of time")

'He is really certain amount of time'

In [83]:
df['overview']=df['overview'].apply(spelling_correction)

In [84]:
df.overview[3]

"Following death District Attorney Harvey Went, Batman assumes responsibility Went's crimes protect late attorney's reputation subsequently hunted Gotham City Police Department. Right years later, Batman encounters mysterious Helena Able villains Lane, new terrorist leader overwhelm Gotham's finest. The Dark Night surfaces protect city branded enemy."

## Remove Punctuations

In [85]:
import string
from string import punctuation

In [86]:
def remove_punctuation(text):
    for char in text:
        if char in punctuation:
            text=text.replace(char,"")
    return text

In [87]:
remove_punctuation('My name is Abdul Rehman-1302#@//;.;\/?<>')

'My name is Abdul Rehman1302'

In [88]:
df['overview']=df['overview'].apply(remove_punctuation)

In [89]:
df.overview[3]

'Following death District Attorney Harvey Went Batman assumes responsibility Wents crimes protect late attorneys reputation subsequently hunted Gotham City Police Department Right years later Batman encounters mysterious Helena Able villains Lane new terrorist leader overwhelm Gothams finest The Dark Night surfaces protect city branded enemy'

## Convert the Text to Lower

In [90]:
df['overview']=df['overview'].str.lower()

In [91]:
df['overview'][3]

'following death district attorney harvey went batman assumes responsibility wents crimes protect late attorneys reputation subsequently hunted gotham city police department right years later batman encounters mysterious helena able villains lane new terrorist leader overwhelm gothams finest the dark night surfaces protect city branded enemy'

##Stemming

In [92]:
import nltk
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [93]:
def stem_words(text):
    return " ".join(stemmer.stem(word) for word in text.split())

In [94]:
stem_words(' following are the wording of the runner Ali')

'follow are the word of the runner ali'

In [95]:
df['overview']=df['overview'].apply(stem_words)

In [96]:
df['overview'][3]

'follow death district attorney harvey went batman assum respons went crime protect late attorney reput subsequ hunt gotham citi polic depart right year later batman encount mysteri helena abl villain lane new terrorist leader overwhelm gotham finest the dark night surfac protect citi brand enemi'

## Tokenization

In [42]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [43]:
def tokenization(text):
    tokens=word_tokenize(text)
    return tokens

In [44]:
df['overview']=df['overview'].apply(tokenization)

In [45]:
df['overview']

0     [in, and, centuri, paraplegia, marin, dispatch...
1     [captain, barbossa, long, believ, dead, come, ...
2     [a, cystic, messag, fond, ’, past, send, trail...
3     [follow, death, district, attorney, harvey, we...
4     [john, garter, warweari, former, militari, cap...
                            ...                        
95    [interstellar, chronicl, adventur, group, expl...
96    [cobb, skill, thief, commit, corpor, espionag,...
97    [from, mind, behind, evangelion, come, hit, la...
98    [ilio, margin, habit, enjoy, quiet, life, swep...
99    [domen, toretto, os, angel, street, race, susp...
Name: overview, Length: 100, dtype: object

In [46]:
df.head()

Unnamed: 0,title,overview
0,Avatar,"[in, and, centuri, paraplegia, marin, dispatch..."
1,Pirates of the Caribbean: At World's End,"[captain, barbossa, long, believ, dead, come, ..."
2,Spectre,"[a, cystic, messag, fond, ’, past, send, trail..."
3,The Dark Knight Rises,"[follow, death, district, attorney, harvey, we..."
4,John Carter,"[john, garter, warweari, former, militari, cap..."


# Feature Extraction

## 1. One Hot Encode

In [47]:
vocabolary=[token for sublist in df['overview'] for token in sublist ]
print(vocabolary)

['in', 'and', 'centuri', 'paraplegia', 'marin', 'dispatch', 'moon', 'candor', 'uniqu', 'mission', 'becom', 'torn', 'follow', 'order', 'protect', 'alien', 'civil', 'captain', 'barbossa', 'long', 'believ', 'dead', 'come', 'back', 'life', 'head', 'edg', 'earth', 'will', 'turner', 'elizabeth', 'want', 'but', 'noth', 'quit', 'seem', 'a', 'cystic', 'messag', 'fond', '’', 'past', 'send', 'trail', 'uncov', 'sinist', 'organ', 'while', 'm', 'battl', 'polit', 'forc', 'keep', 'secret', 'servic', 'aliv', 'fond', 'feel', 'back', 'layer', 'deceit', 'reveal', 'terribl', 'truth', 'behind', 'spectr', 'follow', 'death', 'district', 'attorney', 'harvey', 'went', 'batman', 'assum', 'respons', 'went', 'crime', 'protect', 'late', 'attorney', 'reput', 'subsequ', 'hunt', 'gotham', 'citi', 'polic', 'depart', 'right', 'year', 'later', 'batman', 'encount', 'mysteri', 'helena', 'abl', 'villain', 'lane', 'new', 'terrorist', 'leader', 'overwhelm', 'gotham', 'finest', 'the', 'dark', 'night', 'surfac', 'protect', 'cit

In [48]:
len(vocabolary)

3602

In [49]:
token_to_index = {token: i for i, token in enumerate(vocabolary)}
print("Token to Index Mapping:", token_to_index)

Token to Index Mapping: {'in': 2857, 'and': 2931, 'centuri': 3011, 'paraplegia': 3, 'marin': 3022, 'dispatch': 5, 'moon': 6, 'candor': 7, 'uniqu': 2419, 'mission': 3205, 'becom': 3106, 'torn': 1661, 'follow': 1571, 'order': 3428, 'protect': 3342, 'alien': 3334, 'civil': 2502, 'captain': 3137, 'barbossa': 18, 'long': 1727, 'believ': 3326, 'dead': 2539, 'come': 3512, 'back': 3438, 'life': 3560, 'head': 2088, 'edg': 26, 'earth': 3447, 'will': 2595, 'turner': 29, 'elizabeth': 30, 'want': 3459, 'but': 3543, 'noth': 3345, 'quit': 34, 'seem': 2981, 'a': 3528, 'cystic': 1252, 'messag': 38, 'fond': 1088, '’': 3349, 'past': 2493, 'send': 2873, 'trail': 1097, 'uncov': 3536, 'sinist': 45, 'organ': 2699, 'while': 2160, 'm': 1083, 'battl': 3271, 'polit': 50, 'forc': 3352, 'keep': 2760, 'secret': 3207, 'servic': 54, 'aliv': 2960, 'feel': 3163, 'layer': 59, 'deceit': 60, 'reveal': 3136, 'terribl': 255, 'truth': 384, 'behind': 3510, 'spectr': 65, 'death': 2215, 'district': 2313, 'attorney': 2314, 'harv

In [52]:
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(categories=[range(len(vocabolary))], sparse=False,dtype=int)
onehot_encoded_data = []

for sample in df['overview']:
    encoded_sample = [token_to_index[token] for token in sample]
    onehot_encoded_sample = onehot_encoder.fit_transform(np.array(encoded_sample).reshape(-1, 1))
    onehot_encoded_data.append(onehot_encoded_sample.flatten())

print("One-Hot Encoded Data:")
for sample in onehot_encoded_data:
    print(sample)



One-Hot Encoded Data:
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0



In [53]:
sample.shape

(100856,)

### 2. Bag Of Words

In [97]:
from sklearn.feature_extraction.text import  CountVectorizer
cv=CountVectorizer()

In [99]:
bow=cv.fit_transform(df['overview'])

In [101]:
bow.shape

(100, 1682)

In [102]:
print(cv.vocabulary_)

{'in': 765, 'and': 86, 'centuri': 250, 'paraplegia': 1086, 'marin': 933, 'dispatch': 438, 'moon': 987, 'candor': 226, 'uniqu': 1558, 'mission': 980, 'becom': 161, 'torn': 1507, 'follow': 591, 'order': 1060, 'protect': 1167, 'alien': 61, 'civil': 281, 'captain': 231, 'barbossa': 147, 'long': 896, 'believ': 170, 'dead': 385, 'come': 305, 'back': 137, 'life': 884, 'head': 693, 'edg': 475, 'earth': 469, 'will': 1647, 'turner': 1538, 'elizabeth': 483, 'want': 1606, 'but': 217, 'noth': 1030, 'quit': 1188, 'seem': 1305, 'cystic': 373, 'messag': 960, 'fond': 592, 'past': 1098, 'send': 1310, 'trail': 1516, 'uncov': 1549, 'sinist': 1341, 'organ': 1063, 'while': 1635, 'battl': 155, 'polit': 1137, 'forc': 594, 'keep': 835, 'secret': 1301, 'servic': 1316, 'aliv': 63, 'feel': 561, 'layer': 866, 'deceit': 393, 'reveal': 1241, 'terribl': 1462, 'truth': 1534, 'behind': 168, 'spectr': 1375, 'death': 390, 'district': 440, 'attorney': 123, 'harvey': 688, 'went': 1629, 'batman': 154, 'assum': 120, 'respons

In [103]:
bow=bow.toarray()

In [104]:
bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [106]:
bow[3]

array([0, 0, 0, ..., 0, 0, 0])

In [107]:
# Print each element of the array
for element in bow[3]:
        print(element, end=' ')

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 

## 3. N-Grams

In [108]:
cv=CountVectorizer(ngram_range=(4,4))
n_grams=cv.fit_transform(df['overview'])

In [109]:
print(cv.vocabulary_)

{'in and centuri paraplegia': 1404, 'and centuri paraplegia marin': 161, 'centuri paraplegia marin dispatch': 480, 'paraplegia marin dispatch moon': 2065, 'marin dispatch moon candor': 1766, 'dispatch moon candor uniqu': 759, 'moon candor uniqu mission': 1857, 'candor uniqu mission becom': 432, 'uniqu mission becom torn': 2926, 'mission becom torn follow': 1843, 'becom torn follow order': 315, 'torn follow order protect': 2834, 'follow order protect alien': 1062, 'order protect alien civil': 2028, 'captain barbossa long believ': 445, 'barbossa long believ dead': 283, 'long believ dead come': 1696, 'believ dead come back': 332, 'dead come back life': 678, 'come back life head': 555, 'back life head edg': 266, 'life head edg earth': 1663, 'head edg earth will': 1276, 'edg earth will turner': 816, 'earth will turner elizabeth': 808, 'will turner elizabeth want': 3120, 'turner elizabeth want but': 2890, 'elizabeth want but noth': 827, 'want but noth quit': 3011, 'but noth quit seem': 410, 

In [110]:
n_grams.shape

(100, 3245)

In [111]:
n_grams=n_grams.toarray()

In [112]:
n_grams

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [113]:
n_grams[3]

array([0, 0, 0, ..., 0, 0, 0])

In [114]:
# Print each element of the array
for element in n_grams[3]:
        print(element, end=' ')

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

## 4. Tf-Idf

In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()

In [116]:
tfidf_encode=tfidf.fit_transform(df['overview'])

In [118]:
tfidf.get_feature_names_out()

array(['007', '10', '101yearold', ..., 'young', 'younger', 'youth'],
      dtype=object)

In [119]:
len(tfidf.get_feature_names_out())

1682

In [123]:
tfidf_encode=tfidf_encode.toarray()

In [125]:
tfidf_encode[3]

array([0., 0., 0., ..., 0., 0., 0.])

In [124]:
# Print each element of the array
for element in tfidf_encode[3]:
        print(element, end=' ')

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.15450532675636877 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.13274679124516603 0.0 0.0 0.28355479882341106 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.26549358249033206 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.15450532675636877 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 