In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os

In [2]:
df = pd.read_csv(os.path.join( 'Final_Project', 'Resources', 'FOMC20070131meeting.csv'))
df['polarity'] = df['Meeting of the Federal Open Market Committee on January 30–31, 2007'].apply(lambda x : 1 if x == 'positive' else 0)
class_names = ['negative', 'positive']

In [3]:
### nltk and string transformations
from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import string

### sklearn
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, accuracy_score

In [4]:
import numpy as np
from numpy import random

SEED = 1234

random.seed(SEED)

In [5]:
### Define simple space tokenizer
translator = str.maketrans('', '', string.punctuation)
tk = tokenize.simple.SpaceTokenizer()

def simple_tokenizer(x):
    x = x.translate(translator).strip().lower()
    return tk.tokenize(x)

### Quick look at what the tokenizer produces
print(df['Meeting of the Federal Open Market Committee on January 30–31, 2007'].head().apply(lambda x : simple_tokenizer(x)))

### Developing the corpus or list of sentences for processing later
corpus = df['Meeting of the Federal Open Market Committee on January 30–31, 2007'].tolist()
corpus[0:5]

0    [a, meeting, of, the, federal, open, market, c...
1    [mr, bernanke, chairman\nmr, geithner, vice, c...
2    [mr, fisher, ms, pianalto, and, messrs, plosse...
3    [mr, lacker, and, ms, yellen, presidents, of, ...
4    [mr, barron, first, vice, president, federal, ...
Name: Meeting of the Federal Open Market Committee on January 30–31, 2007, dtype: object


['A meeting of the Federal Open Market Committee was held in the offices of the Board of Governors of the Federal Reserve System in Washington, D.C., on Tuesday, January 30, 2007, at 2:00 p.m., and continued on Wednesday, January 31, 2007, at 9:00 a.m.  Those present were the following:',
 'Mr. Bernanke, Chairman\nMr. Geithner, Vice Chairman Ms. Bies\nMr. Hoenig Mr. Kohn Mr. Kroszner Ms. Minehan Mr. Mishkin Mr. Moskow Mr. Poole Mr. Warsh',
 'Mr. Fisher, Ms. Pianalto, and Messrs. Plosser and Stern, Alternate Members of the Federal Open Market Committee',
 'Mr. Lacker and Ms. Yellen, Presidents of the Federal Reserve Banks of Richmond and San Francisco, respectively',
 'Mr. Barron, First Vice President, Federal Reserve Bank of Atlanta']

In [6]:
vectorizer = TfidfVectorizer(tokenizer=simple_tokenizer, stop_words='english')
tf = vectorizer.fit(corpus)
X = tf.transform(corpus)

### Combining result of TFIDF with `target` columns
full_df = pd.concat([pd.DataFrame(X.toarray()), df['polarity']], axis=1)
print("Vector features : ", vectorizer.get_feature_names()[0:20])
print("Vector shape/size : ",X.shape)

Vector features :  ['', '01', '02', '03', '03—i', '04', '05', '06', '08', '09', '1', '10', '100', '11', '116', '119000', '11—', '12', '120', '12¾']
Vector shape/size :  (4476, 5411)


In [10]:
target_col = 'polarity'
X_train, X_test, y_train, y_test = train_test_split(full_df.drop(target_col, axis=1)
                                                  ,y,test_size=0.25, random_state=SEED)
print(X_train.shape, y_train.shape)

NameError: name 'y' is not defined