![](2022-11-22-01-03-29.png)

# Data Cleaning and Vectorization For NLP

## Import Libraries

In [1]:
import pandas as pd
import numpy as np

# import warnings
# warnings.filterwarnings("ignore")
# pd.set_option('display.max_columns', 50)

![](2022-11-22-01-04-41.png)

## Tokenization

In [2]:
import nltk

In [3]:
sample_text = "Awesome!!!, This is fantastic!!!. We are very pleased...3456"

In [4]:
from nltk.tokenize import word_tokenize , sent_tokenize

In [5]:
# nltk.download('punkt')

In [6]:
# Lower-case the sample once, then split it into word-level and
# sentence-level tokens from the same normalized string.
lowered_text = sample_text.lower()
word_token = word_tokenize(lowered_text)
sent_token = sent_tokenize(lowered_text)

In [7]:
word_token

['awesome',
 '!',
 '!',
 '!',
 ',',
 'this',
 'is',
 'fantastic',
 '!',
 '!',
 '!',
 '.',
 'we',
 'are',
 'very',
 'pleased',
 '...',
 '3456']

In [8]:
sent_token

['awesome!!', '!, this is fantastic!!!.', 'we are very pleased...3456']

## Removing Punctuation and Numbers

In [9]:
# str.isalpha() keeps only purely alphabetic tokens, so punctuation and numbers are dropped.
# To keep numeric tokens as well, use str.isalnum() instead of str.isalpha().
tokens_without_punc = [w for w in word_token if w.isalpha()]
tokens_without_punc

['awesome', 'this', 'is', 'fantastic', 'we', 'are', 'very', 'pleased']

In [10]:
# str.isalnum() keeps alphabetic and numeric tokens; punctuation-only tokens are still dropped.
tokens_without_punc = [w for w in word_token if w.isalnum()]
tokens_without_punc

['awesome', 'this', 'is', 'fantastic', 'we', 'are', 'very', 'pleased', '3456']

![](2022-11-22-01-18-33.png)

## Removing Stopwords

In [11]:
# nltk.download('stopwords')

In [12]:
from nltk.corpus import stopwords

In [13]:
stop_words = stopwords.words('english')  # English stopword list from the NLTK corpus
print(stop_words)
print(len(stop_words))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [14]:
stop_words2 = stopwords.words('turkish')  # Turkish stopword list from the NLTK corpus
print(stop_words2)
print(len(stop_words2))

['acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç', 'için', 'ile', 'ise', 'kez', 'ki', 'kim', 'mı', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o', 'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani']
53


In [15]:
tokens_without_punc

['awesome', 'this', 'is', 'fantastic', 'we', 'are', 'very', 'pleased', '3456']

In [16]:
# Caution: for sentiment analysis, negations (e.g. "not", "no") should NOT be
# removed as stopwords, because dropping them can change the meaning of the text.
tokens_without_sw = [w for w in tokens_without_punc if w not in stop_words]
tokens_without_sw

['awesome', 'fantastic', 'pleased', '3456']

![](2022-11-22-01-39-47.png)

## Data Normalization-Lemmatization

In [17]:
from nltk.stem import WordNetLemmatizer

In [18]:
# nltk.download('wordnet')

In [19]:
# nltk.download('omw-1.4')

In [20]:
WordNetLemmatizer().lemmatize("children")

'child'

In [21]:
# `pos` selects the part of speech used for the lookup: 'n' = noun, 'v' = verb.
WordNetLemmatizer().lemmatize("runs" , pos = 'n')

'run'

In [22]:
# Build the lemmatizer once and reuse it for every token instead of
# constructing a new WordNetLemmatizer for each list element.
lemmatizer = WordNetLemmatizer()
lem = [lemmatizer.lemmatize(t) for t in tokens_without_sw]
lem

['awesome', 'fantastic', 'pleased', '3456']

![](2022-11-22-01-57-41.png)

## Data Normalization-Stemming

In [23]:
from nltk.stem import PorterStemmer

In [24]:
PorterStemmer().stem('development')

'develop'

In [25]:
# Build the stemmer once and reuse it for every token instead of
# constructing a new PorterStemmer for each list element.
stemmer = PorterStemmer()
stem = [stemmer.stem(t) for t in tokens_without_sw]
stem

['awesom', 'fantast', 'pleas', '3456']

## Joining

In [26]:
" ".join(lem)

'awesome fantastic pleased 3456'

## Cleaning Function - for classification (NOT for sentiment analysis)

In [27]:
def cleaning(data):
    """Clean one raw text string for classification.

    Pipeline: lower-case and tokenize, keep only alphabetic tokens, drop
    stopwords, lemmatize, and join the remaining tokens with single spaces.

    Note:
        The module-level ``stop_words`` list is read at call time, so any
        change made to that list before a call affects the result.

    Args:
        data: The raw text to clean (a ``str``).

    Returns:
        The cleaned tokens joined by a single space; an empty string when no
        token survives the filters.
    """
    # 1. Tokenize the lower-cased text.
    text_tokens = word_tokenize(data.lower())

    # 2. Remove punctuation and numbers (keep purely alphabetic tokens).
    alpha_tokens = [w for w in text_tokens if w.isalpha()]

    # 3. Remove stopwords; a set gives constant-time membership checks.
    stop_set = set(stop_words)
    tokens_without_sw = [t for t in alpha_tokens if t not in stop_set]

    # 4. Lemmatize with a single lemmatizer instance (default settings).
    lemmatizer = WordNetLemmatizer()
    text_cleaned = [lemmatizer.lemmatize(t) for t in tokens_without_sw]

    # 5. Join the tokens back into one string.
    return " ".join(text_cleaned)

In [28]:
pd.Series(sample_text).apply(cleaning)

0    awesome fantastic pleased
dtype: object

## Cleaning Function - for sentiment analysis

In [29]:
sample_text= "Awesome!!!, This is fantastic!!!. We are very pleased...3456. don't eat, isn't. no problem for me"

In [30]:
# Strip apostrophes before tokenizing so a contraction stays one token
# (e.g. "don't" becomes "dont").
s = sample_text.replace("'",'')
word = word_tokenize(s)
word 

['Awesome',
 '!',
 '!',
 '!',
 ',',
 'This',
 'is',
 'fantastic',
 '!',
 '!',
 '!',
 '.',
 'We',
 'are',
 'very',
 'pleased',
 '...',
 '3456.',
 'dont',
 'eat',
 ',',
 'isnt',
 '.',
 'no',
 'problem',
 'for',
 'me']

In [31]:
# Keep the negations "not" and "no" for sentiment analysis by removing them
# from the stopword list (in place). The membership check makes this cell safe
# to re-run: list.remove raises ValueError when the item is already gone.
for negation in ("not", "no"):
    if negation in stop_words:
        stop_words.remove(negation)

def cleaning_fsa(data):
    """Clean one raw text string for sentiment analysis.

    Same pipeline as a plain cleaning pass, except that apostrophes are
    stripped first so contractions become single alphabetic tokens (e.g.
    "don't" becomes "dont") and pass the alphabetic filter. In this
    notebook's sample output such tokens ("dont", "isnt") are kept after the
    stopword step.

    Note:
        The module-level ``stop_words`` list is read at call time, so the
        result depends on its current contents.

    Args:
        data: The raw text to clean (a ``str``).

    Returns:
        The cleaned tokens joined by a single space; an empty string when no
        token survives the filters.
    """
    # 1. Remove apostrophes to keep negative auxiliary verbs in the text.
    text = data.replace("'",'')

    # 2. Tokenize the lower-cased text.
    text_tokens = word_tokenize(text.lower())

    # 3. Remove punctuation and numbers (keep purely alphabetic tokens).
    alpha_tokens = [w for w in text_tokens if w.isalpha()]

    # 4. Remove stopwords; a set gives constant-time membership checks.
    stop_set = set(stop_words)
    tokens_without_sw = [t for t in alpha_tokens if t not in stop_set]

    # 5. Lemmatize with a single lemmatizer instance (default settings).
    lemmatizer = WordNetLemmatizer()
    text_cleaned = [lemmatizer.lemmatize(t) for t in tokens_without_sw]

    # 6. Join the tokens back into one string.
    return " ".join(text_cleaned)

In [32]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [33]:
pd.Series(sample_text).apply(cleaning_fsa)

0    awesome fantastic pleased dont eat isnt no pro...
dtype: object

## CountVectorization and TF-IDF Vectorization

In [34]:
df = pd.read_csv("airline_tweets.csv")
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [35]:
df = df[['airline_sentiment','text']]
df

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


In [36]:
df.head(8)

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
7,neutral,@VirginAmerica Really missed a prime opportuni...


In [37]:
df2 = df.copy()

In [38]:
# Clean from the untouched `df` (df2 is a copy of it) so that re-running this
# cell never feeds already-cleaned text back through `cleaning`.
df2["text"] = df["text"].apply(cleaning)
df2.head(8)

Unnamed: 0,airline_sentiment,text
0,neutral,virginamerica dhepburn said
1,positive,virginamerica plus added commercial experience...
2,neutral,virginamerica today must mean need take anothe...
3,negative,virginamerica really aggressive blast obnoxiou...
4,negative,virginamerica really big bad thing
5,negative,virginamerica seriously would pay flight seat ...
6,positive,virginamerica yes nearly every time fly vx ear...
7,neutral,virginamerica really missed prime opportunity ...


![](2022-11-22-02-08-45.png)

## CountVectorization

In [39]:
X = df2["text"]
y = df2["airline_sentiment"]

In [40]:
from sklearn.model_selection import train_test_split

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, stratify = y, random_state = 53)

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
vectorizer = CountVectorizer()

# Fit on the training split only (the same principle as fitting a scaler),
# then reuse the learned vocabulary to transform the test split.
X_train_count = vectorizer.fit_transform(X_train)
# A word in X_test that does not occur in X_train is ignored, which is why the
# vectorizer should be fitted on a large corpus to get good results.
X_test_count = vectorizer.transform(X_test)

In [48]:
vectorizer.get_feature_names_out()

array(['aa', 'aaaand', 'aaadvantage', ..., 'zrh', 'zukes', 'zurich'],
      dtype=object)

In [49]:
X_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [53]:
df_count = pd.DataFrame(X_train_count.toarray(), columns = vectorizer.get_feature_names_out())
df_count

Unnamed: 0,aa,aaaand,aaadvantage,aaalwayslate,aacustomerservice,aadavantage,aadv,aadvantage,aafail,aaron,...,yyz,zabsonre,zambia,zero,zfv,zkatcher,zone,zrh,zukes,zurich
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7315,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7316,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7317,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7318,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
X_train

2316                united also checked email file correct
2648     united hear guitar damaged december use guitar...
12396                     americanair yes bit follow ca dm
7186     jetblue believe irina super disappointed eatup...
13875    americanair seriously not want wait hour fligh...
                               ...                        
5056     southwestair give info flt bdl see cancelled f...
3710     united think guy half full flight held overboo...
5544                  southwestair got taken care thank lt
13638    americanair visiting sju returning paris check...
7421     jetblue domestic clear not sit lovely terminal...
Name: text, Length: 7320, dtype: object

In [58]:
# Explicit label-based lookup: the tweet whose original row label is 5.
X_train.loc[5]

'virginamerica seriously would pay flight seat playing really bad thing flying va'

In [59]:
vectorizer.vocabulary_

{'united': 6782,
 'also': 210,
 'checked': 1064,
 'email': 2039,
 'file': 2382,
 'correct': 1393,
 'hear': 2945,
 'guitar': 2852,
 'damaged': 1562,
 'december': 1609,
 'use': 6878,
 'earn': 1974,
 'living': 3750,
 'get': 2701,
 'act': 66,
 'together': 6543,
 'americanair': 234,
 'yes': 7258,
 'bit': 678,
 'follow': 2512,
 'ca': 892,
 'dm': 1853,
 'jetblue': 3446,
 'believe': 619,
 'irina': 3376,
 'super': 6231,
 'disappointed': 1784,
 'eatup': 1992,
 'cafe': 900,
 'available': 472,
 'bos': 754,
 'gt': 2838,
 'sfo': 5738,
 'flight': 2442,
 'today': 6541,
 'seriously': 5716,
 'not': 4362,
 'want': 7016,
 'wait': 6991,
 'hour': 3090,
 'prepare': 4939,
 'family': 2308,
 'funeral': 2642,
 'usairways': 6865,
 'worst': 7189,
 'airline': 150,
 'alfamilyoffour': 182,
 'maybe': 3958,
 'anyone': 300,
 'answering': 288,
 'phone': 4750,
 'would': 7201,
 'please': 4818,
 'call': 910,
 'back': 512,
 'late': 3627,
 'flightr': 2463,
 'good': 2763,
 'enough': 2090,
 'made': 3864,
 'miss': 4093,
 'first'

![](2022-11-22-02-17-27.png)

## TF-IDF

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [61]:
tf_idf_vectorizer = TfidfVectorizer()

# fit_transform learns the vocabulary and the inverse document frequency (IDF)
# of each term from the training documents, then returns their TF-IDF matrix.
X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
# transform applies the TF-IDF weighting learned from X_train to the test documents.
X_test_tf_idf = tf_idf_vectorizer.transform(X_test)

In [63]:
tf_idf_vectorizer.get_feature_names_out()

array(['aa', 'aaaand', 'aaadvantage', ..., 'zrh', 'zukes', 'zurich'],
      dtype=object)

In [64]:
X_train_tf_idf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [66]:
df_tfidf = pd.DataFrame(X_train_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names_out())
df_tfidf

Unnamed: 0,aa,aaaand,aaadvantage,aaalwayslate,aacustomerservice,aadavantage,aadv,aadvantage,aafail,aaron,...,yyz,zabsonre,zambia,zero,zfv,zkatcher,zone,zrh,zukes,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
# Explicit label-based lookup: the tweet whose original row label is 5.
X_train.loc[5]

'virginamerica seriously would pay flight seat playing really bad thing flying va'

In [68]:
df_tfidf.loc[2].sort_values(ascending = False)

bit         0.565101
follow      0.449861
yes         0.401677
dm          0.397842
ca          0.341823
              ...   
fleek       0.000000
flawless    0.000000
flaw        0.000000
flavor      0.000000
zurich      0.000000
Name: 2, Length: 7300, dtype: float64

In [69]:
# Explicit label-based lookup: the tweet whose original row label is 3.
X_test.loc[3]

'virginamerica really aggressive blast obnoxious entertainment guest face amp little recourse'

In [70]:
# NOTE(review): this rebuilds the same dense TF-IDF frame that an earlier cell
# already assigned to `df_tfidf` — confirm whether this repeat is intentional.
df_tfidf = pd.DataFrame(X_train_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names_out())
df_tfidf

Unnamed: 0,aa,aaaand,aaadvantage,aaalwayslate,aacustomerservice,aadavantage,aadv,aadvantage,aafail,aaron,...,yyz,zabsonre,zambia,zero,zfv,zkatcher,zone,zrh,zukes,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
# Done !