## Count Vectorizer and TFIDF on Inaugural_Speeches Data 

In [1]:
import pandas as pd
import numpy as np

In [2]:
#Importing Data
speech_df=pd.read_csv('inaugural_speeches.csv')
speech_df.head()

Unnamed: 0,Name,Inaugural Address,Date,text
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow-Citizens of the Senate and of the House...
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens: I AM again called upon by th...
2,John Adams,Inaugural Address,"Saturday, March 4, 1797","WHEN it was first perceived, in early times, t..."
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",Friends and Fellow-Citizens: CALLED upon to u...
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805","PROCEEDING, fellow-citizens, to that qualifica..."


In [3]:
# Replace all non letter characters with a whitespace.
speech_df['text_clean'] = speech_df['text'].str.replace('[^a-zA-Z]',' ')
# Change to lower case
speech_df['text_clean']=speech_df['text_clean'].str.lower()
# Print the first 5 rows of the text_clean column
speech_df['text_clean'].head()

0    fellow citizens of the senate and of the house...
1    fellow citizens   i am again called upon by th...
2    when it was first perceived  in early times  t...
3    friends and fellow citizens   called upon to u...
4    proceeding  fellow citizens  to that qualifica...
Name: text_clean, dtype: object

In [4]:
# Find the length of each text
speech_df['text_clean'].str.len()
# Count the number of words in each text
speech_df['text_clean'].str.split().str.len()
# Find the average length of word
speech_df['text_clean'].str.split().str.len()/speech_df['text_clean'].str.len()

0     0.166202
1     0.171537
2     0.167472
3     0.171136
4     0.168113
5     0.168356
6     0.169418
7     0.170001
8     0.169668
9     0.164592
10    0.165738
11    0.166974
12    0.166277
13    0.170764
14    0.167751
15    0.165480
16    0.166658
17    0.168787
18    0.173165
19    0.179461
20    0.174513
21    0.173475
22    0.166878
23    0.168223
24    0.166913
25    0.168061
26    0.164344
27    0.167996
28    0.165053
29    0.178077
30    0.169123
31    0.179192
32    0.182695
33    0.164975
34    0.169403
35    0.164235
36    0.173052
37    0.172265
38    0.178655
39    0.185677
40    0.167214
41    0.176748
42    0.181955
43    0.180379
44    0.182237
45    0.183916
46    0.182623
47    0.178541
48    0.178211
49    0.177111
50    0.186822
51    0.176335
52    0.177873
53    0.176794
54    0.173949
55    0.179805
56    0.176688
57    0.173933
Name: text_clean, dtype: float64

In [5]:
#Counting words (I)
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Instantiate CountVectorizer
cv=CountVectorizer()
# Fit the vectorizer
cv.fit(speech_df['text_clean'])
# Print feature names
print(cv.get_feature_names())
type(cv.get_feature_names())

['abandon', 'abandoned', 'abandonment', 'abate', 'abdicated', 'abeyance', 'abhorring', 'abide', 'abiding', 'abilities', 'ability', 'abject', 'able', 'ably', 'abnormal', 'abode', 'abolish', 'abolished', 'abolishing', 'aboriginal', 'aborigines', 'abound', 'abounding', 'abounds', 'about', 'above', 'abraham', 'abreast', 'abridging', 'abroad', 'absence', 'absent', 'absolute', 'absolutely', 'absolutism', 'absorb', 'absorbed', 'absorbing', 'absorbs', 'abstain', 'abstaining', 'abstract', 'abstractions', 'absurd', 'abundance', 'abundant', 'abundantly', 'abuse', 'abused', 'abuses', 'academies', 'accept', 'acceptance', 'accepted', 'accepting', 'accepts', 'access', 'accessible', 'accession', 'accident', 'accidental', 'accidents', 'acclaim', 'accommodation', 'accommodations', 'accompanied', 'accompany', 'accomplish', 'accomplished', 'accomplishing', 'accomplishment', 'accomplishments', 'accord', 'accordance', 'accorded', 'according', 'accordingly', 'accords', 'account', 'accountability', 'accountab

list

In [6]:
#Counting words (II)
# Apply the vectorizer
cv.transform(speech_df['text_clean']) # This will give Sparse array
# Print the full array
cv.transform(speech_df['text_clean']).toarray()
# Print the shape of cv_array
cv.transform(speech_df['text_clean']).toarray().shape

(58, 9043)

In [7]:
#Limiting your features
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Specify arguements to limit the number of features generated
cv=CountVectorizer(min_df=0.1,max_df=0.9)
# Fit, transform, and convert into array
cv.fit_transform(speech_df['text_clean']).toarray()
# Print the array shape
cv_array=cv.fit_transform(speech_df['text_clean']).toarray()

In [8]:
#Text to DataFrame
# Create a DataFrame with these features
new_text=pd.DataFrame(cv_array,columns=cv.get_feature_names())
# Add the new columns to the original DataFrame
pd.concat([speech_df,new_text],axis=1).head()

Unnamed: 0,Name,Inaugural Address,Date,text,text_clean,abandon,abiding,ability,able,about,...,year,years,yes,yet,yield,you,young,your,zeal,zealously
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow-Citizens of the Senate and of the House...,fellow citizens of the senate and of the house...,0,0,0,0,0,...,0,1,0,0,0,5,0,9,0,0
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens: I AM again called upon by th...,fellow citizens i am again called upon by th...,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,John Adams,Inaugural Address,"Saturday, March 4, 1797","WHEN it was first perceived, in early times, t...",when it was first perceived in early times t...,0,0,0,0,0,...,2,3,0,0,0,0,0,1,1,0
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",Friends and Fellow-Citizens: CALLED upon to u...,friends and fellow citizens called upon to u...,1,0,0,0,1,...,0,0,0,2,0,7,0,7,1,0
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805","PROCEEDING, fellow-citizens, to that qualifica...",proceeding fellow citizens to that qualifica...,0,0,0,1,0,...,2,2,0,2,0,4,0,4,3,0


In [9]:
#TFIDF
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Instantiate TfidfVectorizer
TF=TfidfVectorizer(max_features=100,stop_words='english')
# Fit the vectroizer and transform the data
TF_arr=TF.fit_transform(speech_df['text_clean']).toarray()
# Create a DataFrame with these features
TF_df=pd.DataFrame(TF_arr,columns=TF.get_feature_names())
pd.concat([speech_df['text_clean'],TF_df],axis=1).shape

(58, 101)

In [10]:
train_speech_df=speech_df[['text_clean']].iloc[:40,:]
test_speech_df=speech_df[['text_clean']].iloc[40:,:]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
TF=TfidfVectorizer(max_features=10,stop_words='english')
TF.fit_transform(train_speech_df['text_clean'])
tf_transform=TF.transform(test_speech_df['text_clean']).toarray()
tf_transform_df=pd.DataFrame(tf_transform,columns=TF.get_feature_names())
pd.concat([test_speech_df,tf_transform_df],axis=1).head()

Unnamed: 0,text_clean,constitution,country,government,great,nation,people,power,public,shall,states
0,,0.0,0.201047,0.201047,0.201047,0.140766,0.784773,0.077719,0.0,0.326989,0.360688
1,,0.062166,0.384719,0.05496,0.164879,0.230886,0.589963,0.254951,0.0,0.589963,0.05916
2,,0.0,0.075311,0.0,0.376557,0.316383,0.734932,0.436698,0.0,0.146986,0.0
3,,0.0,0.435323,0.0,0.108831,0.228599,0.106203,0.631062,0.0,0.531017,0.234297
4,,0.0,0.0,0.060667,0.303336,0.764589,0.532824,0.0,0.065304,0.177608,0.0
