In [2]:
import sklearn as sk

from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [3]:
import imdb_functions # Includes load_data()
import numpy as np

# The data is IMDB data, change the path to data directory.
X_train_corpus , y_train, X_test_corpus , y_test = imdb_functions.load_imdb(path = "../../Fall_19/aclImdb")

Loading the imdb data
Train Data loaded.
Test Data loaded.


In [4]:
# Try with the configuration below
# Decision Tree, min_df=100

token = r"(?u)\b[\w\'/]+\b"
vectorizer = CountVectorizer(token_pattern=token, 
                             binary=True,
                             ngram_range=(1,1),
                             min_df=100,
                             stop_words=["the","a","of","and","br","to"])
X_train_vector = vectorizer.fit_transform(X_train_corpus)
X_test_vector = vectorizer.transform(X_test_corpus)

In [5]:
X_train_vector.sum(axis=0)

matrix([[ 153,  219, 1418, ...,  320,  255,  123]], dtype=int64)

In [6]:
dt = DecisionTreeClassifier(max_depth=100)

In [7]:
dt.fit(X_train_vector, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [8]:
vectorizer.get_feature_names()

['0',
 '000',
 '1',
 '1/10',
 '1/2',
 '10',
 '10/10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '1930s',
 "1950's",
 '1950s',
 '1970s',
 '1980s',
 '1st',
 '2',
 '2/10',
 '20',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '20th',
 '24',
 '25',
 '3',
 '3/10',
 '30',
 '4',
 '4/10',
 '40',
 '45',
 '5',
 '50',
 "50's",
 '6',
 '60',
 "60's",
 '60s',
 '7',
 '7/10',
 "70's",
 '70s',
 '8',
 '8/10',
 '80',
 "80's",
 '80s',
 '9',
 '9/10',
 '90',
 "90's",
 '99',
 'abandoned',
 'abilities',
 'ability',
 'able',
 'about',
 'above',
 'absence',
 'absolute',
 'absolutely',
 'absurd',
 'abuse',
 'academy',
 'accent',
 'accents',
 'accept',
 'acceptable',
 'accepted',
 'accident',
 'accidentally',
 'accomplished',
 'according',
 'account',
 'accurate',
 'accused',
 'achieve',
 'achieved',
 'achievement',
 'across',
 'act',
 'acted',
 'acting',
 'action',
 'actions',
 'actor',
 'actors',
 'actress',
 'actresses',
 'acts',
 'actual',
 'actually',
 'ad',
 'adam',
 

In [9]:
len(dt.feature_importances_), len(dt.feature_importances_.nonzero()[0])

(3893, 1769)

In [10]:
# There is no negative value.
len([x for x in dt.feature_importances_ if x<0])

0

In [11]:
# Find biggest coefficients.
importances = list(zip(vectorizer.get_feature_names(), dt.feature_importances_))
rank_importances = sorted(importances, key = lambda x: x[1], reverse=True)
rank_importances[:150]

[('bad', 0.07511226073225666),
 ('worst', 0.04805627537035211),
 ('great', 0.02973196196457402),
 ('waste', 0.02951770847520941),
 ('awful', 0.02017028375730499),
 ('boring', 0.014454779212375101),
 ('excellent', 0.013927867023521046),
 ('best', 0.010726145209974629),
 ('no', 0.010196428382169396),
 ('wonderful', 0.00956555513836395),
 ('poor', 0.009327625624340495),
 ('nothing', 0.006756844140983485),
 ('stupid', 0.006040285967281376),
 ('terrible', 0.005971253278021418),
 ('poorly', 0.00555440804300522),
 ('love', 0.005273197617853429),
 ('perfect', 0.005061614757653801),
 ('worse', 0.00490916119761634),
 ('ridiculous', 0.004242073357966109),
 ('plot', 0.0038725042869399233),
 ('both', 0.0036190685918804574),
 ('lame', 0.003508898724033206),
 ('dull', 0.0034919107133067045),
 ('beautiful', 0.003472073148081397),
 ('favorite', 0.003033982648537086),
 ('disappointing', 0.0028442373767013657),
 ('supposed', 0.0028015695255107346),
 ('well', 0.002798934743471213),
 ('acting', 0.002747043

In [12]:
feature_names = vectorizer.get_feature_names()

In [13]:
dt.feature_importances_[feature_names.index('bad')]

0.07511226073225666

In [14]:
freqs=np.sum(X_train_vector, axis=0).A1

In [15]:
freqs.shape

(3893,)

In [16]:
freqs[feature_names.index('bad')]

5892

In [17]:
freqs[feature_names.index('worst')]

2264

In [18]:
freqs[feature_names.index('1/10')]

158

--------
## Decision Trees and Feature Importances

Feature importance is calculated as the decrease in node impurity weighted by the probability of reaching that node. The node probability can be calculated by the number of samples that reach the node, divided by the total number of samples. The higher the value the more important the feature.

It uses Gini Impurity, and not really the Information Gain (which takes Entropy as the measure).


```python
importance_data[node.feature] += (
                        node.weighted_n_node_samples * node.impurity -
                        left.weighted_n_node_samples * left.impurity -
                        right.weighted_n_node_samples * right.impurity)
```

__Formula:__

<font size="5"><center>_ni<sub>j</sub> = w<sub>j</sub> c<sub>j</sub> - w<sub>left(j)</sub> c<sub>left(j)</sub> - w<sub>right(j)</sub> c<sub>right(j)</sub>_</center></font>






