## Import Packages

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Import Data

In [2]:
dem = pd.read_csv("../datasets/US-Party-Dem_Platform_19.2.csv")
rep = pd.read_csv("../datasets/US-Party-Rep_Platform_19.2..csv")

### Get recent party platforms

In [3]:
dem = dem.loc[dem["year"] > 1990]
rep = rep.loc[rep["year"] > 1990]

### Drop Inconsistent Columns

In [4]:
dem.columns

Index(['year', 'id', 'pap_majortopic', 'pap_subtopic', 'majortopic',
       'subtopic', 'description', 'filter_democrat', 'words'],
      dtype='object')

In [5]:
rep.columns

Index(['year', 'id', 'pap_majortopic', 'pap_subtopic', 'majortopic',
       'subtopic', 'description', 'words'],
      dtype='object')

In [6]:
dem = dem.drop(["filter_democrat"], axis = 1)

### Create Label

In [7]:
dem["party"] = 1
rep["party"] = 0

In [8]:
df = pd.concat([dem, rep])

### Set Features and Build Model

In [9]:
df.head()

Unnamed: 0,year,id,pap_majortopic,pap_subtopic,majortopic,subtopic,description,words,party
4,1992,9334,2,202,2,202,provide for the full range of reproductive cho...,20,1
5,1992,9404,2,202,2,202,Democrats stand behind the right of every woma...,29,1
6,1992,9406,2,202,2,202,The goal of our nation must be to make abortio...,18,1
7,1996,10069,2,202,2,202,The Democratic Party stands behind the right o...,23,1
8,1996,10075,2,202,2,202,Our goal is to make abortion less necessary an...,17,1


In [10]:
X = df["description"]
y = df["party"]

### Create TFIDF Vectorizer Logistic Regression Model

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

In [13]:
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
lr.score(X_train, y_train)

0.8488147633387383

In [15]:
lr.score(X_test, y_test)

0.770958777211672

In [16]:
et = ExtraTreeClassifier()
et.fit(X_train, y_train)

ExtraTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, random_state=None,
                    splitter='random')

In [17]:
et.score(X_train, y_train)

0.9993822870820787

In [18]:
et.score(X_test, y_test)

0.6294580824455767

In [19]:
rf = RandomForestClassifier(max_depth=150, max_features=500)
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=150, max_features=500, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [20]:
rf.score(X_train, y_train)

0.9827812524129411

In [21]:
rf.score(X_test, y_test)

0.7195460861509958

In [22]:
params = {
    "max_depth" : [2000],
    "min_samples_split": [10, 15, 20]
}

In [23]:
grid = GridSearchCV(rf, params)

In [24]:
grid.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=150,
                                              max_features=500,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='w

In [25]:
grid.best_params_

{'max_depth': 2000, 'min_samples_split': 10}