In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('movies.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,overview
0,0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
2,2,The Godfather Part II,In the continuing saga of the Corleone crime f...
3,3,Schindler's List,The true story of how businessman Oskar Schind...
4,4,12 Angry Men,The defense and the prosecution have rested an...


In [4]:
df = df.drop(columns=['Unnamed: 0'])

In [5]:
df = df.rename(columns={'title': 'movie_name', 'overview': 'description'})

In [6]:
df.head()

Unnamed: 0,movie_name,description
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
2,The Godfather Part II,In the continuing saga of the Corleone crime f...
3,Schindler's List,The true story of how businessman Oskar Schind...
4,12 Angry Men,The defense and the prosecution have rested an...


In [7]:
df.shape

(9268, 2)

In [8]:
df.sample(4)

Unnamed: 0,movie_name,description
7297,Extinction,"A chief mechanic at a factory, haunted by apoc..."
267,Requiem for a Dream,The hopes and dreams of four ambitious people ...
440,Little Women,Four sisters come of age in America in the aft...
5993,Delicacy,A French woman mourning over the death of her ...


In [9]:
df.isnull().sum()

movie_name     0
description    1
dtype: int64

In [10]:
df.dropna(inplace=True)

In [11]:
df.isnull().sum()

movie_name     0
description    0
dtype: int64

# Text Preprocessing

## 1. Lowercasing

In [12]:
df['movie_name'] = df['movie_name'].str.lower()

In [13]:
df['description'] = df['description'].str.lower()

In [14]:
df.head()

Unnamed: 0,movie_name,description
0,the shawshank redemption,framed in the 1940s for the double murder of h...
1,the godfather,"spanning the years 1945 to 1955, a chronicle o..."
2,the godfather part ii,in the continuing saga of the corleone crime f...
3,schindler's list,the true story of how businessman oskar schind...
4,12 angry men,the defense and the prosecution have rested an...


## 2. Removing Punctuation

In [15]:
import string

In [16]:
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
def remove_punc(text):
    return text.translate(str.maketrans('', '', exclude))

In [18]:
remove_punc("Hello! my name is professor, and [what's yours ?]")

'Hello my name is professor and whats yours '

## 3. Text Correction

In [19]:
from textblob import TextBlob

In [20]:
df1 = df.iloc[:20]

In [21]:
df1.head()

Unnamed: 0,movie_name,description
0,the shawshank redemption,framed in the 1940s for the double murder of h...
1,the godfather,"spanning the years 1945 to 1955, a chronicle o..."
2,the godfather part ii,in the continuing saga of the corleone crime f...
3,schindler's list,the true story of how businessman oskar schind...
4,12 angry men,the defense and the prosecution have rested an...


In [22]:
corrected_descriptions = []
for description in df1['description']:
    text_blob = TextBlob(str(description))
    corrected_text = text_blob.correct()
    corrected_descriptions.append(corrected_text)

df1['description'] = corrected_descriptions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['description'] = corrected_descriptions


# Removing Stopwords

In [23]:
from nltk.corpus import stopwords

In [24]:
df['description'][0]

'framed in the 1940s for the double murder of his wife and her lover, upstanding banker andy dufresne begins a new life at the shawshank prison, where he puts his accounting skills to work for an amoral warden. during his long stretch in prison, dufresne comes to be admired by the other inmates -- including an older prisoner named red -- for his integrity and unquenchable sense of hope.'

In [25]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [26]:
df['description'] = df['description'].apply(remove_stopwords)

In [27]:
df['description'][0]

'framed   1940s   double murder   wife   lover, upstanding banker andy dufresne begins  new life   shawshank prison,   puts  accounting skills  work   amoral warden.   long stretch  prison, dufresne comes   admired    inmates -- including  older prisoner named red --   integrity  unquenchable sense  hope.'

# Tokenization

In [28]:
from nltk.tokenize import word_tokenize, sent_tokenize

### Word Tokenization

In [29]:
def word_tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

In [30]:
df['description'] = df['description'].apply(word_tokenize_text)

In [31]:
df.head()

Unnamed: 0,movie_name,description
0,the shawshank redemption,"[framed, 1940s, double, murder, wife, lover, ,..."
1,the godfather,"[spanning, years, 1945, 1955, ,, chronicle, fi..."
2,the godfather part ii,"[continuing, saga, corleone, crime, family, ,,..."
3,schindler's list,"[true, story, businessman, oskar, schindler, s..."
4,12 angry men,"[defense, prosecution, rested, jury, filing, j..."


In [32]:
df['description'][1]

['spanning',
 'years',
 '1945',
 '1955',
 ',',
 'chronicle',
 'fictional',
 'italian-american',
 'corleone',
 'crime',
 'family',
 '.',
 'organized',
 'crime',
 'family',
 'patriarch',
 ',',
 'vito',
 'corleone',
 'barely',
 'survives',
 'attempt',
 'life',
 ',',
 'youngest',
 'son',
 ',',
 'michael',
 'steps',
 'take',
 'care',
 'would-be',
 'killers',
 ',',
 'launching',
 'campaign',
 'bloody',
 'revenge',
 '.']

### Sentence Tokenization

In [33]:
def sentence_tokenize_text(text):
    sentences = sent_tokenize(text)
    return sentences

In [34]:
text1 = """ Lorem! ipsum dolor sit amet, consectetur? adipiscing! elit. Pellentesque? /ultricies ipsum vitae congue \vehicula. Phasellus ultrices pretium justo. Morbi placerat quam lacus, a hendrerit est condimentum eu. Quisque pretium ut ipsum ac venenatis."""
sent_tokenize(text1)

[' Lorem!',
 'ipsum dolor sit amet, consectetur?',
 'adipiscing!',
 'elit.',
 'Pellentesque?',
 '/ultricies ipsum vitae congue \x0behicula.',
 'Phasellus ultrices pretium justo.',
 'Morbi placerat quam lacus, a hendrerit est condimentum eu.',
 'Quisque pretium ut ipsum ac venenatis.']

In [35]:
import spacy

In [36]:
sent1 = 'I have a Ph.D in A.I'
sent2 = "We're here to help! mail us at nks@gmail.com"
sent3 = 'A 5km ride cost $10.50'

In [37]:
nlp = spacy.load('en_core_web_sm')

In [38]:
doc1 = nlp(sent1)
doc2 = nlp(sent2)
doc3 = nlp(sent3)

In [39]:
for token in doc1:
    print(token)

I
have
a
Ph
.
D
in
A.I


In [40]:
for token in doc2:
    print(token)

We
're
here
to
help
!
mail
us
at
nks@gmail.com


In [41]:
for token in doc3:
    print(token)

A
5
km
ride
cost
$
10.50


## Stemming

In [42]:
from nltk.stem.porter import PorterStemmer

In [43]:
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [44]:
sample = "go going goes"
stem_words(sample)

'go go goe'

## Lemmatization

In [45]:
import nltk
from nltk.stem import WordNetLemmatizer

In [46]:
wordnet_lemmatizer = WordNetLemmatizer()

In [47]:
sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."

punctuations = "?:!.,;"
sentence_words = nltk.word_tokenize(sentence)

for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

sentence_words

print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos='v')))

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 


## Removing HTML Tags

In [48]:
import re

In [49]:
text = "<p>I love <strong>cars</strong> because i am a <em>car enthusiast</em>.</p>"

In [50]:
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [51]:
remove_html_tags(text)

'I love cars because i am a car enthusiast.'

## Removing URLs

In [52]:
url1 = "For more information, visit our website at https://www.example.com."
url2 = "The article can be found online at http://www.example.com/article123."

In [53]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [54]:
print(remove_url(url1))
print(remove_url(url2))

For more information, visit our website at 
The article can be found online at 


## Handling Emojis

In [55]:
import emoji

In [56]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               "\U0001F600-\U0001F64F"  # Emojis and emoticons
                               "\U0001F300-\U0001F5FF"  # Miscellaneous symbols and pictographs
                               "\U0001F680-\U0001F6FF"  # Transport and map symbols
                               "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
                               "\U00002702-\U000027B0"  # Miscellaneous symbols
                               "\U000024C2-\U0001F251"  # Enclosed characters
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [57]:
remove_emoji("My name is professor.💀💀")

'My name is professor.'

In [58]:
em = emoji.demojize('Python is 🐍')
print(em)

Python is :snake:
