# Imports and Data Loading

In [1]:
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score,recall_score,f1_score,matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_svmlight_file
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif

In [2]:
ndtv = load_svmlight_file("/kaggle/input/tv-news-channel-commercial-detection-dataset/NDTV.txt")
cnn = load_svmlight_file("/kaggle/input/tv-news-channel-commercial-detection-dataset/CNN.txt")
bbc = load_svmlight_file("/kaggle/input/tv-news-channel-commercial-detection-dataset/BBC.txt")
cnnibn = load_svmlight_file("/kaggle/input/tv-news-channel-commercial-detection-dataset/CNNIBN.txt")
timesnow = load_svmlight_file("/kaggle/input/tv-news-channel-commercial-detection-dataset/TIMESNOW.txt")

In [3]:
x, y = sc.sparse.vstack([ndtv[0], cnn[0], bbc[0], cnnibn[0], timesnow[0]]), np.concatenate([ndtv[1], cnn[1], bbc[1], cnnibn[1], timesnow[1]])

# Exploratory Data Analysis

In [170]:
x.shape

(129685, 4125)

In [171]:
y.shape

(129685,)

In [172]:
type(x)

scipy.sparse.csr.csr_matrix

In [4]:
df = pd.DataFrame.sparse.from_spmatrix(x)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4115,4116,4117,4118,4119,4120,4121,4122,4123,4124
0,29.0,3.821209,1.567568,13.547628,7.242389,0.019883,0.012195,0.067241,0.049107,3406.866211,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.524255,0.866498
1,25.0,3.052969,1.641484,22.334589,15.734018,0.023027,0.010731,0.077,0.045884,3324.158203,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072369,0.534711
2,82.0,1.601274,1.508805,5.860583,3.301121,0.025948,0.006956,0.082317,0.044845,3771.984131,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.552685,0.918764
3,25.0,4.819368,2.879584,41.382828,24.448074,0.014387,0.007596,0.069875,0.046916,3301.686035,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117911,0.22321
4,29.0,2.768753,1.797319,13.338054,9.980667,0.011506,0.007269,0.100647,0.067401,3266.021484,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.529581,0.96813


In [5]:
df.dtypes

0       Sparse[float64, 0]
1       Sparse[float64, 0]
2       Sparse[float64, 0]
3       Sparse[float64, 0]
4       Sparse[float64, 0]
               ...        
4120    Sparse[float64, 0]
4121    Sparse[float64, 0]
4122    Sparse[float64, 0]
4123    Sparse[float64, 0]
4124    Sparse[float64, 0]
Length: 4125, dtype: object

In [174]:
df.count()

1        129685
2        129685
3        129685
4        129685
5        129685
6        129685
7        129685
8        129685
9        129685
10       129685
11       129685
12       129685
13       129685
14       129685
15       129685
16       129685
17       129685
label    129685
dtype: int64

In [175]:
df.isnull().sum()

1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
label    0
dtype: int64

# Data Preprocessing

In [177]:
df.drop(df.iloc[:, 18:4123], inplace=True, axis=1)

In [6]:
for i in df.columns:
    df[i] = df[i].values.to_dense().astype(np.float64)

In [7]:
df.dtypes

0       float64
1       float64
2       float64
3       float64
4       float64
         ...   
4120    float64
4121    float64
4122    float64
4123    float64
4124    float64
Length: 4125, dtype: object

In [180]:
df['label'] = y.astype(int).tolist()

In [181]:
df['label'].value_counts()

 1    82231
-1    47454
Name: label, dtype: int64

In [182]:
df.loc[df['label'] == -1, 'label'] = 0
df['label'].value_counts()

1    82231
0    47454
Name: label, dtype: int64

# Base model and predictions

In [183]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(['label'], axis='columns'), df['label'], test_size=0.15, random_state=102, stratify=df['label'])
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.888397676450933

# Exploring different ways of improving accuracy

## 1. Undersampling to reduce label imbalance

In [184]:
df_zero = df.loc[df['label'] == 0,:]
df_one = df.loc[df['label'] == 1,:]
df_zero.shape, df_one.shape

((47454, 21), (82231, 21))

In [185]:
# Undersampling
df_train = pd.concat([df_zero.iloc[0:40000, :], df_one.iloc[0:40000, :]])
df_test = pd.concat([df_zero.iloc[40000:47000, :], df_one.iloc[40000:47000, :]])
x_train, x_test, y_train, y_test = df_train.drop(['label'], axis='columns'), df_test.drop(['label'], axis='columns'), df_train['label'], df_test['label']
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.8034285714285714

## 2. Oversampling to reduce label imbalancer

In [186]:
df_zero_duplicated = pd.concat([df_zero.iloc[0:37454, :], df_zero.iloc[0:37454, :]])
df_zero_duplicated.shape, df_one.shape

((74908, 21), (82231, 21))

In [187]:
df_train = pd.concat([df_zero_duplicated.iloc[0:74908, :], df_one.iloc[0:74908, :]])
df_test = pd.concat([df_zero.iloc[37454:, :], df_one.iloc[74908:, :]])
x_train, x_test, y_train, y_test = df_train.drop(['label'], axis='columns'), df_test.drop(['label'], axis='columns'), df_train['label'], df_test['label']
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.8225480574958148

## 3. Feature scaling to reduce outlier influence

In [188]:
df_scaled = df.drop(['label'], axis='columns').copy()
df_scaled.values[:] = StandardScaler().fit_transform(df.drop(['label'], axis='columns'))

In [189]:
x_train, x_test, y_train, y_test = train_test_split(df_scaled, df['label'], test_size=0.15, random_state=102, stratify=df['label'])
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.8891173597902637

## 4. Feature selection

In [191]:
sum = 0
for i,v in enumerate(clf.feature_importances_):
    sum += v*100
    print('Feature: %0d, Score: %.5f' % (df.columns[i],v*100))
sum

Feature: 0, Score: 2.02040
Feature: 1, Score: 2.90925
Feature: 2, Score: 4.35925
Feature: 3, Score: 6.35667
Feature: 4, Score: 3.23589
Feature: 5, Score: 7.98557
Feature: 6, Score: 18.77749
Feature: 7, Score: 3.47188
Feature: 8, Score: 3.12627
Feature: 9, Score: 2.95773
Feature: 10, Score: 3.16062
Feature: 11, Score: 2.54452
Feature: 12, Score: 7.08111
Feature: 13, Score: 2.53983
Feature: 14, Score: 3.15638
Feature: 15, Score: 12.17621
Feature: 16, Score: 5.45635
Feature: 17, Score: 4.05364
Feature: 4123, Score: 2.35708
Feature: 4124, Score: 2.27384


100.0

In [193]:
imp=pd.DataFrame(mutual_info_classif(df.drop(['label'], axis='columns'),df['label']),index=df_scaled.columns)
imp.columns=['Importance']
imp.sort_values(by='Importance',ascending=False)

Unnamed: 0,Importance
15,0.222137
12,0.214646
10,0.209684
14,0.207894
9,0.198538
11,0.184614
13,0.173773
16,0.173155
2,0.16937
4,0.166899


In [201]:
df_selective_features = df.drop(['label'], axis='columns').copy()
df_selective_features.drop(list(imp[imp['Importance'] < 0.1].index), axis='columns', inplace=True)

In [202]:
x_train, x_test, y_train, y_test = train_test_split(df_selective_features, df['label'], test_size=0.15, random_state=102, stratify=df['label'])
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.9007865110779828

## 4. Hyperparameter tuning (Danick)

## 5. Finding meaning in the Bag of Words (Joseph)

# Building and evaluating our final model

In [205]:
# THIS IS AN EXAMPLE MODEL, GET THE NEW MODEL FROM ALL THE BOYS AFTER THEY DONE
x_train, x_test, y_train, y_test = train_test_split(df_selective_features, df['label'], test_size=0.15, random_state=102, stratify=df['label'])
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.9010949467948388

# What makes a good commerial? A model explanation (Danick)