## ***Text Pre-processing Module***

---



In [12]:
import numpy as np
import pandas as pd
import re
import sys
import nltk
import spacy
import string

# Reading the dataset
df = pd.read_csv("Cricket.csv")
df.head()

Unnamed: 0,Title,Discription
0,Venue for Asia Cup 2020 to be decided soon,The Executive Board of the Asian Cricket Counc...
1,Stokes does not need extra captaincy pressure:...,Kevin Pietersen does not think Ben Stokes shou...
2,Archer urges victims to speak out against soci...,England fast bowler Jofra Archer has urged vic...
3,Using saliva won’t pose any risk once you’re i...,It seems unlikely cricket chiefs will row back...
4,Real-life 'Terminal': Footballer stuck at Mumb...,The Mumbai airport became home for a 23-year-o...


# **Lower Casing**

In [13]:
# Lower Casing --> creating new column called text_lower
df['Title_New']  = df['Title'].str.lower()
df['Title_New'].head()

df['Discription_New']  = df['Discription'].str.lower()
df['Discription'].head()

0    The Executive Board of the Asian Cricket Counc...
1    Kevin Pietersen does not think Ben Stokes shou...
2    England fast bowler Jofra Archer has urged vic...
3    It seems unlikely cricket chiefs will row back...
4    The Mumbai airport became home for a 23-year-o...
Name: Discription, dtype: object

# **Removal of Punctuations**

In [14]:
#removing punctuation, creating a new column called 'text_punct]'
df['Title_New'] = df['Title_New'].str.replace('[^\w\s]','')
df['Title_New'].head()
df['Discription_New'] = df['Discription_New'].str.replace('[^\w\s]','')
df['Discription_New'].head()

0    the executive board of the asian cricket counc...
1    kevin pietersen does not think ben stokes shou...
2    england fast bowler jofra archer has urged vic...
3    it seems unlikely cricket chiefs will row back...
4    the mumbai airport became home for a 23yearold...
Name: Discription_New, dtype: object

#**Stop-word removal**

In [15]:
#Importing stopwords from nltk library
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
# Function to remove the stopwords
def stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
# Applying the stopwords to 'text_punct' and store into 'text_stop'
df["Title_Stopwords"] = df["Title_New"].apply(stopwords)
df["Title_Stopwords"].head()
df["Discription_StopWords"] = df["Discription_New"].apply(stopwords)
df["Discription_StopWords"].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    executive board asian cricket council acc disc...
1    kevin pietersen think ben stokes captain engla...
2    england fast bowler jofra archer urged victims...
3    seems unlikely cricket chiefs row back saliva ...
4    mumbai airport became home 23yearold ghanaian ...
Name: Discription_StopWords, dtype: object

# **Spelling Correction**

In [17]:
from textblob import TextBlob
df['Title_Stopwords'][:].apply(lambda x: str(TextBlob(x).correct()))
df['Discription_StopWords'][:].apply(lambda x: str(TextBlob(x).correct()))

0     executive board asia cricket council act discu...
1     even peterson think ben stores captain england...
2     england fast bowler sofa arched urged victims ...
3     seems unlikely cricket chiefs row back saliva ...
4     lumbar airport became home 23yearold ghanaian ...
5     embraces cricket board confirmed expressed int...
6     former parisian captain was arm played era int...
7     day 1994 brian lata struck beaten 501the highe...
8     international cricket council ice friday said ...
9     england great geoffrey boycott announced frida...
10    former west indies captain warren same alleged...
11    west indies star batman chris gale expressing ...
12    fast bowler diam plunkett left england summer ...
13    england captain joe root said would miss first...
14    domestic cricket england august 1 earliest eng...
15    england play three tests home west indies july...
16    former west indies captain warren same urged c...
17    playing empty stadium ideal bigger challen

# **Emoji Removal**

In [18]:
# Function to remove emoji.
def remove_emoji(string):
  emoji_pattern = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                          u"\U0001F680-\U0001F6FF"  # transport & map symbols
                          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          u"\U00002702-\U000027B0"
                          u"\U000024C2-\U0001F251"
                          "]+", flags=re.UNICODE)
  return emoji_pattern.sub(r'', string)


df['Title_Stopwords'] = df['Title_Stopwords'].apply(remove_emoji)
