In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

import numpy as np
import pandas as pd
import seaborn as sns

This dataset contains, for each U.S. designated market area (DMA), the percentage of fans of each of seven sports leagues (NFL, NBA, MLB, NHL, NASCAR, CBB, and CFB) and the percentage of the vote won by Donald Trump in 2016.

In [2]:
# Read the raw table: header=1 takes the column names from the second line of
# the file, and index_col=0 turns the first column into the row index.
fans_raw = pd.read_csv(
    "NFL_fandom_data-google_trends.csv",
    header=1,
    index_col=0,
)
fans_raw

Unnamed: 0_level_0,NFL,NBA,MLB,NHL,NASCAR,CBB,CFB,Trump 2016 Vote%
DMA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Abilene-Sweetwater TX,45%,21%,14%,2%,4%,3%,11%,79.13%
Albany GA,32%,30%,9%,1%,8%,3%,17%,59.12%
Albany-Schenectady-Troy NY,40%,20%,20%,8%,6%,3%,4%,44.11%
Albuquerque-Santa Fe NM,53%,21%,11%,3%,3%,4%,6%,39.58%
Alexandria LA,42%,28%,9%,1%,5%,3%,12%,69.64%
...,...,...,...,...,...,...,...,...
Wilmington NC,40%,20%,12%,5%,8%,9%,7%,55.91%
Yakima-Pasco-Richland-Kennewick WA,52%,19%,11%,3%,3%,4%,7%,55.26%
Youngstown OH,41%,23%,15%,5%,5%,3%,8%,53.17%
Yuma AZ-El Centro CA,47%,22%,18%,4%,3%,2%,4%,37.48%


To make this frame useful for numerical work, we strip out the percent sign character `%` from all the columns and convert them to `float` type.

In [3]:
def remover(column):
    """Convert a column of percent strings to floats.

    Parameters
    ----------
    column : pandas.Series
        String values such as "45%" or "79.13%".

    Returns
    -------
    pandas.Series
        The same values with "%" characters stripped from both ends and the
        remainder parsed as float (e.g. 45.0, 79.13).
    """
    return column.str.strip("%").astype(float)


# transform applies `remover` to each column of the raw frame.
fans = fans_raw.transform(remover)
fans.head()

Unnamed: 0_level_0,NFL,NBA,MLB,NHL,NASCAR,CBB,CFB,Trump 2016 Vote%
DMA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Abilene-Sweetwater TX,45.0,21.0,14.0,2.0,4.0,3.0,11.0,79.13
Albany GA,32.0,30.0,9.0,1.0,8.0,3.0,17.0,59.12
Albany-Schenectady-Troy NY,40.0,20.0,20.0,8.0,6.0,3.0,4.0,44.11
Albuquerque-Santa Fe NM,53.0,21.0,11.0,3.0,3.0,4.0,6.0,39.58
Alexandria LA,42.0,28.0,9.0,1.0,5.0,3.0,12.0,69.64


## 1.
(3.1) Create a feature frame called `X` from all the columns of `fans` except the last.

In [4]:
# Feature frame: every column of `fans` except the last one.
X = fans.iloc[:, :-1]

In [5]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0_level_0,NFL,NBA,MLB,NHL,NASCAR,CBB,CFB
DMA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Abilene-Sweetwater TX,45.0,21.0,14.0,2.0,4.0,3.0,11.0
Albany GA,32.0,30.0,9.0,1.0,8.0,3.0,17.0
Albany-Schenectady-Troy NY,40.0,20.0,20.0,8.0,6.0,3.0,4.0
Albuquerque-Santa Fe NM,53.0,21.0,11.0,3.0,3.0,4.0,6.0
Alexandria LA,42.0,28.0,9.0,1.0,5.0,3.0,12.0


In [6]:
# TESTS
# X must be a DataFrame with 207 rows and exactly the seven league columns.
assert type(X) == pd.DataFrame, "X has the wrong type"
assert X.shape == (207,7), "X has the wrong shape"
assert set(X.columns) == {'CBB', 'CFB', 'MLB', 'NASCAR', 'NBA', 'NFL', 'NHL'}, "X has the wrong columns"
print("OK")


OK


In [None]:
# Intentionally left blank--do not delete

## 2.
(3.1) Make a Boolean label series called `y` that is `True` in each row where Trump got more than half of the vote.

In [10]:
# Label series: True in each row where the Trump 2016 vote percentage exceeds 50.
y = fans['Trump 2016 Vote%'] > 50

In [11]:
# Preview the first rows of the Boolean label series.
y.head()

DMA
Abilene-Sweetwater TX          True
Albany GA                      True
Albany-Schenectady-Troy NY    False
Albuquerque-Santa Fe NM       False
Alexandria LA                  True
Name: Trump 2016 Vote%, dtype: bool

In [12]:
# TESTS
# y must be a Boolean Series containing exactly 138 True values.
assert type(y) == pd.Series, "y has the wrong type"
assert y.dtype == 'bool', "y has wrong data type"
assert sum(y) == 138, "y has the wrong values"
print("OK")

OK


In [None]:
# Intentionally left blank--do not delete

## 3.
(3.2) Split the dataset, reserving 15% of it as `X_test,y_test` for testing and the rest as `X_train,y_train` for training. 

**IMPORTANT**: Make sure the split order is randomized starting from random state 3383.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for testing. The fixed random_state makes the
# shuffled split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=3383
)


In [14]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0_level_0,NFL,NBA,MLB,NHL,NASCAR,CBB,CFB
DMA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Providence RI-New Bedford MA,41.0,24.0,18.0,10.0,3.0,3.0,3.0
Ft. Smith-Fayetteville-Springdale-Rogers AR,35.0,25.0,12.0,3.0,5.0,5.0,16.0
Ottumwa IA-Kirksville MO,37.0,16.0,18.0,4.0,11.0,9.0,6.0
Denver CO,49.0,19.0,13.0,7.0,3.0,3.0,6.0
Erie PA,44.0,16.0,15.0,12.0,6.0,3.0,5.0


In [15]:
# TESTS
# The split must contain 175 training rows and 32 test rows, and the first
# training row must match the expected values (checks the random state).
assert X_train.shape[0]==175 and X_test.shape[0]==32, "Wrong numbers of rows in the split"
assert all( np.isclose(X_train.iloc[0,:].values,[41,24,18,10,3,3,3]) ), "Wrong rows in the split"
print("OK")

OK


## 4.
(3.3) Train a decision tree called `dtree` of maximum depth 4 on the training dataset. Then compute the accuracy of the classifier on the test set, as the value `dtree_acc`.

**Important!** The decision tree classifier may randomly break ties in its decisions. To make your results reproducible, set its `random_state` to 302 when you create the classifier.

In [16]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Depth-4 tree with a fixed random_state; fit() returns the fitted estimator,
# so construction and training can be chained.
dtree = DecisionTreeClassifier(random_state=302, max_depth=4).fit(X_train, y_train)

# Test-set predictions are stored in y_pred because later cells reuse them.
y_pred = dtree.predict(X_test)

# Accuracy: fraction of test rows whose predicted label equals the true label.
dtree_acc = accuracy_score(y_true=y_test, y_pred=y_pred)


In [17]:
# Report the test-set accuracy as a percentage with four decimal places.
print(f"Decision tree accuracy = {dtree_acc:.4%}")

Decision tree accuracy = 68.7500%


In [18]:
# TESTS
# dtree must be a depth-4 DecisionTreeClassifier with test accuracy strictly
# between 0.6 and 0.8.
assert type(dtree)==DecisionTreeClassifier and dtree.max_depth==4, "Classifier is not set up correctly"
assert 0.6 < dtree_acc < 0.8, "accuracy score is wrong"
print("OK")

OK


In [None]:
# Intentionally left blank--do not delete

## 5.
(3.3) Treating a Trump majority (`y` is `True`, i.e., *Trump 2016 Vote%* above 50) as the "positive" case, find the confusion matrix. How many false positives does the classifier have on the test set?

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are true labels and columns are predicted labels. Passing `labels`
# pins the order to [False, True] and guarantees a 2x2 matrix even if one of
# the classes is absent from the test split or from the predictions.
Confusion = confusion_matrix(y_test, y_pred, labels=[False, True])

# False positives: truly False (row 0) but predicted True (column 1).
FP = Confusion[0, 1]

print("Confusion Matrix:\n", Confusion)
print("False Positives:", FP)

Confusion Matrix:
 [[ 3  7]
 [ 3 19]]
False Positives: 7


In [26]:
# Report the confusion matrix and the number of false positives.
print("Confusion matrix:")
print(Confusion)
print(f"There are {FP} false positives on the test set.")

Confusion matrix:
[[ 3  7]
 [ 3 19]]
There are 7 false positives on the test set.


In [31]:
# TESTS
# The matrix must be 2x2 with entries summing to the number of predictions,
# and FP must lie between 0 and that number.
assert Confusion.shape == (2, 2), "Confusion matrix is the wrong type or shape"
assert Confusion.sum() == len(y_pred), "Confusion matrix entries are wrong"
assert 0 <= FP <= len(y_pred), "FP value is wrong"
print("OK")

OK


In [None]:
# Intentionally left blank--do not delete

## 6.
(3.3) Still treating a Trump majority (`y` is `True`) as the "positive" case, compute the recall on the test set.

In [32]:
from sklearn.metrics import recall_score

# Recall for the True class: true positives / (true positives + false negatives).
recall = recall_score(
    y_true=y_test,
    y_pred=y_pred,
    pos_label=True,
)

In [33]:
# Report the test-set recall as a percentage with two decimal places.
print(f"Recall was {recall:.2%} on the test set.")

Recall was 86.36% on the test set.


In [34]:
# TESTS
# Recall must be strictly between 0.8 and 0.9.
assert 0.8 < recall < 0.9
print("OK")

OK


In [None]:
# Intentionally left blank--do not delete

## 7.
(3.3) Which of the sports leagues (i.e., columns) is most important to the classifier? What fraction of the overall impurity reduction does it account for?

In [None]:
# Importance of each feature as reported by the fitted tree.
feature_importances = dtree.feature_importances_

# Position of the largest importance, mapped back to a column name. The
# lookup uses X_train because that is the frame the tree was fit on.
most_important_index = np.argmax(feature_importances)
most_important = X_train.columns[most_important_index]

# Share of the total importance held by the top feature. scikit-learn
# documents feature_importances_ as already normalized, so dividing by the
# sum is only a safeguard.
fraction_reduction = feature_importances[most_important_index] / feature_importances.sum()

print("Most important feature:", most_important)
print("Fraction of overall impurity reduction:", fraction_reduction)

Most important feature: NASCAR
Fraction of overall impurity reduction: 0.5958104892983503


In [36]:
# Report the top feature and its share of the impurity reduction as a percentage.
print(f"{most_important} is most important, accounting for {fraction_reduction:.3%} of the impurity reduction.")

NASCAR is most important, accounting for 59.581% of the impurity reduction.


In [37]:
# TESTS
# most_important must be a str naming a column of X, and fraction_reduction
# must be strictly between 0.5 and 0.75.
assert type(most_important) == str, "most_important must be a string"
assert most_important in X.columns, "most_important must be the name of a column"
assert 0.5 < fraction_reduction < 0.75, "wrong value"
print("OK")

OK


In [None]:
# Intentionally left blank--do not delete