In [1]:
import pandas as pd
import numpy as np

In [2]:
df= pd.read_csv('/content/test_data_cleaning.csv')

In [3]:
df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about # earthquake is different cities,..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting . # Spokane # wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
df.shape

(3263, 4)

**TEXT PREPROCESSING**

**LOWERCASING**

In [6]:
df['text'].str.lower()

0                      just happened a terrible car crash
1       heard about  # earthquake is different cities,...
2       there is a forest fire at spot pond, geese are...
3          apocalypse lighting .   # spokane  # wildfires
4           typhoon soudelor kills 28 in china and taiwan
                              ...                        
3258    earthquake safety los angeles  safety fastener...
3259    storm in ri worse than last hurricane .  my ci...
3260                    green line derailment in chicago 
3261      meg issues hazardous weather outlook  ( hwo )  
3262     # city of calgary has activated its municipal...
Name: text, Length: 3263, dtype: object

**REMOVING HTML TAGS**

In [7]:
import re 
def remove_html_tags(text):
  pattern= re.compile('<.*?>')
  return pattern.sub(r'',text)

In [8]:
df['text'].apply(remove_html_tags)

0                      Just happened a terrible car crash
1       Heard about  # earthquake is different cities,...
2       there is a forest fire at spot pond, geese are...
3          Apocalypse lighting .   # Spokane  # wildfires
4           Typhoon Soudelor kills 28 in China and Taiwan
                              ...                        
3258    EARTHQUAKE SAFETY LOS ANGELES  SAFETY FASTENER...
3259    Storm in RI worse than last hurricane .  My ci...
3260                    Green Line derailment in Chicago 
3261      MEG issues Hazardous Weather Outlook  ( HWO )  
3262     # City of Calgary has activated its Municipal...
Name: text, Length: 3263, dtype: object

**REMOVING STOP WORDS**

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [14]:
def remove_stopwords(text):
  new_text=[]
  for word in text.split():
    if word in stopwords.words('english'):
      new_text.append('')
    else:
        new_text.append(word)
        x= new_text[:]
        new_text.clear()
        return " ".join(x)
        

In [15]:
df['text'].apply(remove_stopwords)

0             Just
1            Heard
2           forest
3       Apocalypse
4          Typhoon
           ...    
3258    EARTHQUAKE
3259         Storm
3260         Green
3261           MEG
3262             #
Name: text, Length: 3263, dtype: object

**FEATURE EXTRACTION**

**BAG OF WORDS OR UNIGRAM TECHNIQUE**

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
cv= CountVectorizer()

In [20]:
bow= cv.fit_transform(df['text'])

In [22]:
print(cv.vocabulary_)



In [23]:
print(bow[0].toarray())
print(bow[1].toarray())

[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]


**BAG OF NGRAMS**

**BIGRAMS**

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
cv= CountVectorizer(ngram_range=(2,2))

In [27]:
bow= cv.fit_transform(df['text'])

In [30]:
print(cv.vocabulary_)



In [35]:
print(len(cv.vocabulary_))

33455


**TRIGRAM**

In [31]:
cv= CountVectorizer(ngram_range=(3,3))

In [32]:
bow= cv.fit_transform(df['text'])

In [33]:
print(cv.vocabulary_)



In [37]:
print(len(cv.vocabulary_))

33455


**TFIDF- TERM FREQUENCY AND INVERSE DOCUMENT FREQUENCY**

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
tfidf= TfidfVectorizer()

In [40]:
tfidf.fit_transform(df['text']).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [41]:
print(tfidf.idf_)

[6.25749537 7.99209643 8.39756154 ... 7.70441435 8.39756154 8.39756154]


In [42]:
print(tfidf.get_feature_names_out())

['00' '000' '00am' ... 'ìàekdar' 'ìñ2' 'û¼']
