# Imports and Data Loading

In [17]:
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score,recall_score,f1_score,matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_svmlight_file
from sklearn.preprocessing import StandardScaler

In [18]:
ndtv = load_svmlight_file("D:/data_storage/Projects/ml_commercial_detection/TV_News_Channel_Commercial_Detection_Dataset/NDTV.txt")
cnn = load_svmlight_file("D:/data_storage/Projects/ml_commercial_detection/TV_News_Channel_Commercial_Detection_Dataset/CNN.txt")
bbc = load_svmlight_file("D:/data_storage/Projects/ml_commercial_detection/TV_News_Channel_Commercial_Detection_Dataset/BBC.txt")
cnnibn = load_svmlight_file("D:/data_storage/Projects/ml_commercial_detection/TV_News_Channel_Commercial_Detection_Dataset/CNNIBN.txt")
timesnow = load_svmlight_file("D:/data_storage/Projects/ml_commercial_detection/TV_News_Channel_Commercial_Detection_Dataset/TIMESNOW.txt")

In [19]:
x, y = sc.sparse.vstack([ndtv[0], cnn[0], bbc[0], cnnibn[0], timesnow[0]]), np.concatenate([ndtv[1], cnn[1], bbc[1], cnnibn[1], timesnow[1]])

# Exploratory Data Analysis

In [20]:
x.shape

(129685, 4125)

In [21]:
y.shape

(129685,)

In [22]:
type(x)

scipy.sparse._csr.csr_matrix

# Data Preprocessing

In [23]:
df = pd.DataFrame.sparse.from_spmatrix(x)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4115,4116,4117,4118,4119,4120,4121,4122,4123,4124
0,29.0,3.821209,1.567568,13.547628,7.242389,0.019883,0.012195,0.067241,0.049107,3406.866211,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.524255,0.866498
1,25.0,3.052969,1.641484,22.334589,15.734018,0.023027,0.010731,0.077,0.045884,3324.158203,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072369,0.534711
2,82.0,1.601274,1.508805,5.860583,3.301121,0.025948,0.006956,0.082317,0.044845,3771.984131,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.552685,0.918764
3,25.0,4.819368,2.879584,41.382828,24.448074,0.014387,0.007596,0.069875,0.046916,3301.686035,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117911,0.22321
4,29.0,2.768753,1.797319,13.338054,9.980667,0.011506,0.007269,0.100647,0.067401,3266.021484,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.529581,0.96813


In [24]:
df.drop(df.iloc[:, 18:4123], inplace=True, axis=1)

In [25]:
for i in df.columns:
    df[i] = df[i].values.to_dense().astype(np.float64)

In [26]:
df.dtypes

0       float64
1       float64
2       float64
3       float64
4       float64
5       float64
6       float64
7       float64
8       float64
9       float64
10      float64
11      float64
12      float64
13      float64
14      float64
15      float64
16      float64
17      float64
4123    float64
4124    float64
dtype: object

In [27]:
df['label'] = y.astype(int).tolist()

In [28]:
df['label'].value_counts()

 1    82231
-1    47454
Name: label, dtype: int64

In [29]:
df.loc[df['label'] == -1, 'label'] = 0
df['label'].value_counts()

1    82231
0    47454
Name: label, dtype: int64

# Base model and predictions

In [30]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(['label'], axis='columns'), df['label'], test_size=0.15, random_state=102, stratify=df['label'])
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.8897342312239758

# Exploring different ways of improving accuracy

## 1. Undersampling to reduce label imbalancer

In [31]:
df_zero = df.loc[df['label'] == 0,:]
df_one = df.loc[df['label'] == 1,:]
df_zero.shape, df_one.shape

((47454, 21), (82231, 21))

In [32]:
# Undersampling
df_train = pd.concat([df_zero.iloc[0:40000, :], df_one.iloc[0:40000, :]])
df_test = pd.concat([df_zero.iloc[40000:47000, :], df_one.iloc[40000:47000, :]])
x_train, x_test, y_train, y_test = df_train.drop(['label'], axis='columns'), df_test.drop(['label'], axis='columns'), df_train['label'], df_test['label']
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.8034285714285714

## 2. Oversampling to reduce label imbalancer

In [33]:
df_zero_duplicated = pd.concat([df_zero.iloc[0:37454, :], df_zero.iloc[0:37454, :]])
df_zero_duplicated.shape, df_one.shape

((74908, 21), (82231, 21))

In [34]:
df_train = pd.concat([df_zero_duplicated.iloc[0:74908, :], df_one.iloc[0:74908, :]])
df_test = pd.concat([df_zero.iloc[37454:, :], df_one.iloc[74908:, :]])
x_train, x_test, y_train, y_test = df_train.drop(['label'], axis='columns'), df_test.drop(['label'], axis='columns'), df_train['label'], df_test['label']
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.8210471627316285

## 3. Feature scaling to reduce outlier influence

In [35]:
df_scaled = df.drop(['label'], axis='columns').copy()
df_scaled.values[:] = StandardScaler().fit_transform(df.drop(['label'], axis='columns'))

In [36]:
x_train, x_test, y_train, y_test = train_test_split(df_scaled, df['label'], test_size=0.15, random_state=102, stratify=df['label'])
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.8890145478846451

## 4. Feature selection

## 5. Hyperparametr Tuning

In [37]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
m=DecisionTreeClassifier()

In [38]:
x_data=df.drop(['label'],axis='columns')
y_data=df['label']

In [46]:
search_space={
    'splitter':['best','random'],
    'criterion':['gini','entropy','log_loss'],
    'max_features':['sqrt','log2']
}
gscv=GridSearchCV(estimator=m,param_grid=search_space,verbose=0,cv=4)

In [47]:
gscv.fit(x_data,y_data)

In [48]:
gscv.best_params_

{'criterion': 'entropy', 'max_features': 'log2', 'splitter': 'best'}

In [57]:
m=DecisionTreeClassifier(criterion='entropy', max_features='log2', splitter='best')
search_space={
    'max_depth':[1,4,8,14,20],
    'min_samples_split':[10,50,100,200],
    'min_samples_leaf':[2,6,20,60,150,300],
}
rscv=RandomizedSearchCV(estimator=m,param_distributions=search_space,n_iter=50,verbose=0,cv=4,random_state=42)

In [58]:
rscv.fit(x_data, y_data)

In [60]:
rscv.best_params_

{'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 20}

In [63]:
search_space={
    'max_depth':[16,18,20,21],
    'min_samples_split':[6,8,10,12,14],
    'min_samples_leaf':[1,2,3,4],
}
gscv=GridSearchCV(estimator=m,param_grid=search_space,verbose=0,cv=4)

In [64]:
gscv.fit(x_data,y_data)

In [65]:
gscv.best_params_

{'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 6}

In [66]:
search_space={
    'max_depth':[19,20,21],
    'min_samples_split':[5,6,7],
    'min_samples_leaf':[1,2],
}
gscv=GridSearchCV(estimator=m,param_grid=search_space,verbose=0,cv=4)

In [67]:
gscv.fit(x_data,y_data)

In [68]:
gscv.best_params_

{'max_depth': 21, 'min_samples_leaf': 2, 'min_samples_split': 6}

In [44]:
best_model=DecisionTreeClassifier(
  criterion='entropy', 
  max_features='log2', 
  splitter='best',
  max_depth=21,
  min_samples_leaf=2,
  min_samples_split=6
)
default_model=DecisionTreeClassifier()

In [40]:
best_model.fit(x_train,y_train)
y_pred = best_model.predict(x_test)
accuracy_score(y_test, y_pred)

0.8638256310080707

Worst performance...That does not look right...

In [42]:
from sklearn.model_selection import cross_val_score

In [49]:
cross_val_score(default_model,x_data,y_data,cv=10).mean()

0.8914143691436358

In [50]:
cross_val_score(best_model,x_data,y_data,cv=10).mean()

0.8680496393454538