In [1]:
import cudf
import cuml
from cuml.preprocessing.LabelEncoder import LabelEncoder

In [183]:
# Load the development (training) CSV straight into a GPU-backed cuDF DataFrame.
df = cudf.read_csv('../input/instagramclassificationproblem/dev.csv')

In [184]:
# Drop the last column of the frame by position.
# NOTE(review): this rebinds `df` to a narrower frame, so re-running the cell
# without reloading the CSV removes one more column each time.
df = df.iloc[:,:-1]
df

In [107]:
# Count the missing values in each column.
df.isna().sum()

### Hence no NaN values in the dataset

## Outlier detection

In [98]:
from sklearn.ensemble import IsolationForest

# Copy the first nine columns to host memory once; the same pandas frame is
# used for both fitting and prediction.
features_pd = df.to_pandas().iloc[:, :9]

clf = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.05,
                      max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
clf.fit(features_pd)
pred = clf.predict(features_pd)

# Keep the labels in their own Series instead of adding an 'anomaly' column to
# `df`: later cells build feature matrices from `df` (get_dummies, drop of the
# target, Pool), and a derived column that does not exist in the test file
# would silently become a training feature.
anomaly = cudf.Series(pred, index=df.index, name='anomaly')
outliers = df.loc[anomaly == -1]
outlier_index = list(outliers.to_pandas().index)

# Number of normal (1) and anomalous (-1) points.
print(anomaly.value_counts())

### Visualising

In [100]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from cuml.decomposition import PCA
from cuml.preprocessing import StandardScaler

# Standardise the first nine columns, then project them onto three principal
# components for plotting.
scaler = StandardScaler()
X = scaler.fit_transform(df.iloc[:, :9])

pca = PCA(n_components=3)
X_reduce = pca.fit_transform(X)

In [101]:
# Copy the projected points to host memory once instead of converting the
# whole frame again for every coordinate passed to matplotlib.
X_reduce_pd = X_reduce.to_pandas()

# `outlier_index` holds row positions, so select and exclude by position.
outlier_points = X_reduce_pd.iloc[outlier_index]
inlier_points = X_reduce_pd.drop(X_reduce_pd.index[outlier_index])

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel("x_composite_1")
ax.set_ylabel("x_composite_2")
ax.set_zlabel("x_composite_3")

# Points the IsolationForest labelled as normal.
ax.scatter(inlier_points.iloc[:, 0], inlier_points.iloc[:, 1], zs=inlier_points.iloc[:, 2],
           s=4, lw=1, label="inliers", c="green")
# Points the IsolationForest predicted as outliers (these are predictions, not ground truth).
ax.scatter(outlier_points.iloc[:, 0], outlier_points.iloc[:, 1], outlier_points.iloc[:, 2],
           lw=2, s=60, marker="x", c="red", label="outliers")
ax.legend()
plt.show()

In [76]:
import pandas as pd
import numpy as np

In [93]:
# NOTE: this import rebinds the name `PCA` (imported from cuml in an earlier
# cell) to the scikit-learn implementation.
from sklearn.decomposition import PCA

# Project the first nine columns onto two principal components on the host.
features_pd = df.to_pandas().iloc[:, :9]
pca = PCA(2)
pca.fit(features_pd)
res = pd.DataFrame(pca.transform(features_pd))

# Background: the IsolationForest decision function evaluated on a regular grid
# of the PCA plane. Each grid point is mapped back to the original feature
# space with inverse_transform because `clf` was fitted on those nine columns.
# (Previously the (n, 2) coordinate array itself was passed to contourf, which
# interprets its argument as a grid of heights and so drew a meaningless map.)
xx, yy = np.meshgrid(np.linspace(res[0].min(), res[0].max(), 100),
                     np.linspace(res[1].min(), res[1].max(), 100))
grid = pd.DataFrame(pca.inverse_transform(np.c_[xx.ravel(), yy.ravel()]),
                    columns=features_pd.columns)
Z = clf.decision_function(grid).reshape(xx.shape)

plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
b1 = plt.scatter(res[0], res[1], c='green',
                 s=20, label="normal points")
# Predicted outliers are drawn on top with a red edge.
b2 = plt.scatter(res.iloc[outlier_index, 0], res.iloc[outlier_index, 1], c='green', s=20,
                 edgecolor="red", label="predicted outliers")
plt.legend(loc="upper right")
plt.show()

Note to self: These outlier detections appear to be inaccurate, probably due to some issue in the data.

## Using One hot encoding

In [185]:
# Integer-encode the target column in place, then one-hot encode the two day columns.
le_target = LabelEncoder()
df['has_new_comments'] = le_target.fit_transform(df['has_new_comments'])
encoded_df = cudf.get_dummies(
    df,
    columns=['day_posted', 'current_day'],
)

In [186]:
# Display the encoded frame from the sixth column onwards.
encoded_df.iloc[:,5:]

In [187]:
# Target vector and feature matrix (everything except the target column).
y = encoded_df['has_new_comments']
X = encoded_df.drop(columns=['has_new_comments'])

## Scaling

In [188]:
from cuml.preprocessing import MinMaxScaler
# Rebinds `scaler`, which previously held the StandardScaler used for the PCA plots.
scaler = MinMaxScaler()

In [189]:
# Fit the scaler on the feature matrix and replace `X` with the scaled result.
# NOTE(review): the scaler is fitted on all rows here, before the
# train/validation split done further down, so validation rows influence the
# fitted scaling parameters.
X = scaler.fit_transform(X)
X

In [8]:
# Display the scaled features from the sixth column onwards.
X.iloc[:,5:]

# Splitting

In [190]:
from cuml.model_selection import train_test_split

In [191]:
# Hold out 20% of the rows for validation; the seed is fixed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Model

## Hyper Parameter Tuning

In [192]:
from catboost import CatBoostClassifier

In [40]:
# Hyper-parameter values explored by the grid search.
grid = {
    'iterations': [100, 300, 500, 1000],
    'learning_rate': [0.1, 0.5, 1],
    'max_depth': [2, 3, 5],
    'l2_leaf_reg': [1, 5, 7],
}

# GPU-backed classifier used only for tuning.
tuning_model = CatBoostClassifier(
    task_type="GPU",
    devices='0:1',
    loss_function='CrossEntropy',
)

# CatBoost is given pandas objects, so the cuDF training split is copied to the host.
grid_search_result = tuning_model.grid_search(
    grid,
    X=X_train.to_pandas(),
    y=y_train.to_pandas(),
    plot=True,
)

In [51]:
# Show the 'params' entry of the grid-search result.
grid_search_result['params']

## Model Training

In [193]:
# Classifier trained on the one-hot encoded, min-max scaled features.
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=5,
    verbose=True,
    loss_function='CrossEntropy',
    eval_metric="AUC",
)

# Host copies of the validation split, used as the evaluation set during training.
validation_set = (X_val.to_pandas(), y_val.to_pandas())

model.fit(
    X_train.to_pandas(),
    y_train.to_pandas(),
    eval_set=validation_set,
    plot=True,
)

In [27]:
# Persist the trained model to disk in CatBoost's binary "cbm" format.
model.save_model(
    'insta_classfication',
    format="cbm",
    export_parameters=None,
    pool=None,
)

In [28]:
# Create an empty classifier and load the model file saved by the previous cell into it.
from_file = CatBoostClassifier()

from_file.load_model("./insta_classfication")

# Test

In [194]:
# Read the competition file and drop its last two columns by position.
df_test = cudf.read_csv(
    '../input/instagramclassificationproblem/comp.csv'
).iloc[:, :-2]

In [195]:
# One-hot encode the same two day columns that were encoded for the training frame.
encoded_df_test = cudf.get_dummies(df_test, columns=['day_posted', 'current_day'])

In [196]:
# Apply the scaler as it was fitted on the training features. Calling
# fit_transform here would re-learn min/max from the test rows, so the test
# features would be scaled differently from what the model was trained on.
# NOTE(review): this assumes the test frame has the same columns, in the same
# order, as the training feature matrix — confirm against comp.csv.
X_test = scaler.transform(encoded_df_test)

In [206]:
# Display the model's predictions for the scaled test features.
model.predict(X_test.to_pandas())

In [198]:
def yes_no(item):
    """Map an encoded prediction to its label: 'yes' when it equals 1, otherwise 'no'."""
    if item == 1:
        return 'yes'
    return 'no'

In [199]:
# Predict on the host copy of the test features and keep the result next to the test rows.
test_predictions = model.predict(X_test.to_pandas())
df_test['yes_no'] = test_predictions
df_test

In [200]:
# Preview the stored predictions mapped through yes_no (displayed only, not assigned).
df_test['yes_no'].to_pandas().apply(yes_no)

In [201]:
# Re-read the competition file and display its ID column.
df_output = cudf.read_csv('../input/instagramclassificationproblem/comp.csv')
df_output['ID']

In [202]:
# Keep only the ID column; the list selector keeps the result a DataFrame.
df_output = df_output[['ID']]
df_output

In [203]:
# Convert the stored predictions to 'yes'/'no' on the host and attach them to the ID frame.
predicted_labels = df_test['yes_no'].to_pandas().apply(yes_no)
df_output['has_new_comments'] = predicted_labels
df_output

In [204]:
# Write the submission file without the index column.
df_output.to_csv('submission3.csv', index=False)

# Trying with LabelEncoding days of week [Results of One Hot are above]

In [51]:
le = LabelEncoder()
# Fit once on the values of both day columns so a given day receives the same
# integer code in `day_posted` and `current_day`. Refitting the encoder per
# column assigns codes independently and can map the same day to different
# integers whenever the two columns do not contain identical sets of values.
le.fit(cudf.concat([df['day_posted'], df['current_day']], ignore_index=True))
df['day_posted'] = le.transform(df['day_posted'])
df['current_day'] = le.transform(df['current_day'])
# The one-hot encoding (cudf.get_dummies) is intentionally not applied in this experiment.

In [54]:
# Target vector and feature matrix taken from the label-encoded frame.
y = df['has_new_comments']
X = df.drop(columns=['has_new_comments'])

In [57]:
# Refit `scaler` on the label-encoded features and replace `X` with the scaled result.
X = scaler.fit_transform(X)

In [58]:
# Same 80/20 split and seed as in the one-hot experiment.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [142]:
# Inspect the container type returned by the split for the training target.
type(y_train)

In [59]:
# Classifier for the label-encoded experiment; several extra metrics are tracked via custom_loss.
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=5,
    verbose=True,
    custom_loss=['AUC', 'Accuracy', 'CrossEntropy', 'Logloss'],
)

# Host copies of the validation split, used as the evaluation set during training.
validation_set = (X_val.to_pandas(), y_val.to_pandas())

model.fit(
    X_train.to_pandas(),
    y_train.to_pandas(),
    eval_set=validation_set,
    plot=True,
)

### **Observation**: Label-encoding the days of the week reduces performance compared with one-hot encoding

# Training only on important features given by catboost

In [173]:
from catboost import CatBoostClassifier, Pool

In [182]:
# Display the current state of the training frame.
df

In [175]:
# Build a CatBoost Pool from host copies of the features and the target,
# declaring the two day columns as categorical features.
pool_features = df.drop('has_new_comments', axis=1).to_pandas()
pool_labels = df['has_new_comments'].to_pandas()
pool_data = Pool(
    pool_features,
    pool_labels,
    cat_features=['day_posted', 'current_day'],
)

In [170]:
# Fresh classifier with the same settings as the one-hot experiment.
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=5,
    verbose=True,
    loss_function='CrossEntropy',
    eval_metric="AUC",
)

In [139]:
# Encode the target directly from the cuDF column, as the earlier encoding cell
# does. The previous `.to_pandas()` round trip copied the column to the host
# only to hand it to a GPU encoder, and pandas input is not accepted by every
# cuML release.
df['has_new_comments'] = LabelEncoder().fit_transform(df['has_new_comments'])

In [176]:
# Train on the Pool so the day columns are treated as categorical features
# (the alternative of dropping both day columns and fitting on the remaining
# columns is not used).
model.fit(
    pool_data,
)

In [116]:
import catboost

In [178]:
# Feature importances computed on the same Pool the model was fitted on.
importance_options = dict(prettified=True, thread_count=-1, verbose=False)
important_features = model.get_feature_importance(data=pool_data, **importance_options)

In [205]:
# Display the feature-importance result.
important_features

In [179]:
# Keep only the features the model actually used. Listing every 'Feature Id'
# returned all features, so the "important features only" experiment trained
# on the full feature set.
# NOTE(review): a strictly positive importance is the most conservative
# cut-off; raise the threshold to prune more aggressively.
imp_feat = list(
    important_features.loc[important_features['Importances'] > 0, 'Feature Id']
)

In [180]:
# Display the selected feature names.
imp_feat

In [156]:
# Split using only the selected feature columns (unscaled, taken directly from `df`).
X_train, X_val, y_train, y_val = train_test_split(
    df[imp_feat],
    df['has_new_comments'],
    test_size=0.2,
    random_state=0,
)

In [157]:
# Refit the classifier on the selected-feature split, evaluating on the validation part.
validation_set = (X_val.to_pandas(), y_val.to_pandas())
model.fit(
    X_train.to_pandas(),
    y_train.to_pandas(),
    eval_set=validation_set,
    plot=True,
)

In [163]:
# Reload the competition file and drop its last four columns by position.
# NOTE(review): the model above was fitted on df[imp_feat]; presumably this
# positional slice is meant to yield the same columns in the same order —
# verify against comp.csv, or select the columns by name instead.
df_test = cudf.read_csv('../input/instagramclassificationproblem/comp.csv')
X = df_test.iloc[:,:-4]
X

In [164]:
# Predict on the host copy of the test features and store the result with the test rows.
test_predictions = model.predict(X.to_pandas())
df_test['yes_no'] = test_predictions

# Rebuild the submission frame: ID column plus the 'yes'/'no' label for each prediction.
predicted_labels = df_test['yes_no'].to_pandas().apply(yes_no)
df_output = df_output[['ID']]
df_output['has_new_comments'] = predicted_labels
df_output

In [167]:
# Write the second submission file without the index column.
df_output.to_csv('submission2.csv', index=False)