In [1]:
import pandas as pd 
import numpy as np

In [2]:
df = pd.read_csv('/content/train_data_cleaning.csv')

In [3]:
df.head

<bound method NDFrame.head of          id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  
0     Our Deeds are the Reason of this  # earthquake...       1  
1              Forest fire near La Ronge Sask .  Canada       1  
2     All residents asked to  ' shelter in place '  ...       1  
3     13,000 people receive  # wildfires evacuation ...       1  
4     Just got sent this photo from Ruby  # Alaska a...       1  
...                                                 ...     ...  
7608  Two giant cranes holding a bridge collapse int...       1  
7609   @ Aria Ahrary  @ TheTawniest The out of contr...  

In [5]:
df.shape

(7613, 5)

**LOWERCASING THE COLUMNS**

In [6]:
df['text'].str.lower()

0       our deeds are the reason of this  # earthquake...
1                forest fire near la ronge sask .  canada
2       all residents asked to  ' shelter in place '  ...
3       13,000 people receive  # wildfires evacuation ...
4       just got sent this photo from ruby  # alaska a...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609     @ aria ahrary  @ thetawniest the out of contr...
7610    m1 . 94  [ 01 : 04 utc ]  ? 5km s of volcano h...
7611    police investigating after an e - bike collide...
7612    the latest :  more homes razed by northern cal...
Name: text, Length: 7613, dtype: object

In [7]:
df['location'].str.lower()

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
7608    NaN
7609    NaN
7610    NaN
7611    NaN
7612    NaN
Name: location, Length: 7613, dtype: object

In [8]:
df['keyword'].str.lower()

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
7608    NaN
7609    NaN
7610    NaN
7611    NaN
7612    NaN
Name: keyword, Length: 7613, dtype: object

**REMOVING HTML TAGS USING REGEX**

In [10]:
import re
def remove_html_tags(text):
  pattern = re.compile('<.*?>')
  return pattern.sub(r'',text)

In [11]:
df['text'] = df['text'].apply(remove_html_tags)

In [12]:
df['text']

0       Our Deeds are the Reason of this  # earthquake...
1                Forest fire near La Ronge Sask .  Canada
2       All residents asked to  ' shelter in place '  ...
3       13,000 people receive  # wildfires evacuation ...
4       Just got sent this photo from Ruby  # Alaska a...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609     @ Aria Ahrary  @ TheTawniest The out of contr...
7610    M1 . 94  [ 01 : 04 UTC ]  ? 5km S of Volcano H...
7611    Police investigating after an e - bike collide...
7612    The Latest :  More Homes Razed by Northern Cal...
Name: text, Length: 7613, dtype: object

**REMOVING PUNCTUATIONS**

In [13]:
import string

In [14]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
exclude= string.punctuation

In [16]:
def remove_punc(text):
  for char in exclude:
    text= text.replace(char,'')
    return text

In [17]:
text= 'string with. punctuation?'

**CHAT WORDS TREATMENT**

In [20]:
def chat_words(text):
  new_text=[]
  for w in text.split():
    if w.upper() in chat_words:
      new_text.append(chat_words[w.upper()])
    else:
        new_text.append(w)
        return " ".join(new_text)

In [26]:
import nltk

In [27]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [28]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [29]:
def remove_stopwords(text):
  new_text = []
  for word in text.split():
    if word in stopwords.words('english'):
      new_text.append('')
    else:
      new_text.append(word)
      x= new_text[:]
      new_text.clear()
      return " ".join(x)

In [30]:
df['text'].apply(remove_stopwords)

0          Our
1       Forest
2          All
3       13,000
4         Just
         ...  
7608       Two
7609         @
7610        M1
7611    Police
7612       The
Name: text, Length: 7613, dtype: object

**TOKENIZATION**

In [34]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [37]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [38]:
sent1= 'I am going to visit new delhi!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'visit', 'new', 'delhi', '!']

**STEMMING**

In [39]:
from nltk.stem.porter import PorterStemmer

In [40]:
ps= PorterStemmer()
def stem_words(text):
  return " ".join([ps.stem(word)for word in text.split()])

In [42]:
sample="walks walked walking"
stem_words(sample)

'walk walk walk'