In [None]:
# Load libraries
from sklearn.ensemble import AdaBoostClassifier
from sklearn import datasets
# Import train_test_split function
from sklearn.model_selection import train_test_split
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

In [None]:
# Load only the label and the three version columns that showed high feature
# importance (obtained from an LGBM model).
import numpy as np, pandas as pd
load = ['HasDetections', 'AvSigVersion', 'Census_OSVersion', 'OsBuildLab']
# Every selected column is first read as a categorical ...
df = pd.read_csv(
    '../input/microsoft-malware-prediction/train.csv',
    usecols=load,
    dtype='category',
)
# ... and the label is then converted to int8.
df['HasDetections'] = df['HasDetections'].astype('int8')

In [None]:
# Replace the missing values of every column.
# Most columns are categorical, so each gap is filled with the column's mode
# (its most frequent value).
for column in df.columns:
    column_modes = df[column].mode()
    if column_modes.empty:
        # A column without any observed value has no mode; leave it untouched
        # instead of failing on the first-element lookup.
        continue
    # Assign the filled column back to the frame. Calling
    # fillna(..., inplace=True) on df[column] is chained assignment: newer
    # pandas warns about it, and under copy-on-write it does not update df.
    df[column] = df[column].fillna(column_modes.iloc[0])

In [None]:
# The version numbers in the dataset are associated with points in time, so they
# are translated to timestamps with the external lookup files provided on Kaggle.
from datetime import datetime, date, timedelta

# AvSigVersion -> timestamp.
# np.load returns an array wrapping the pickled lookup object; indexing it with
# [()] unwraps that object so it can be passed to Series.map below.
# NOTE(review): allow_pickle=True unpickles the file content, which can execute
# arbitrary code -- only load files from a trusted source.
datedictAS = np.load('../input/malware-timestamps/AvSigVersionTimestamps.npy',allow_pickle=True)[()]
df['DateAS'] = df['AvSigVersion'].map(datedictAS)  

# Census_OSVersion -> timestamp (same loading pattern as above).
datedictOS = np.load('../input/malware-timestamps-2/OSVersionTimestamps.npy',allow_pickle=True)[()]
df['DateOS'] = df['Census_OSVersion'].map(datedictOS)  

# BL timestamp
def convert(x):
    """Extract the build timestamp embedded in an ``OsBuildLab`` string.

    The fifth dot-separated field of ``x`` is parsed with the format
    ``%y%m%d-%H%M``.

    Parameters
    ----------
    x : str
        Build-lab string, e.g. ``'17134.1.amd64fre.rs4_release.180410-1804'``.

    Returns
    -------
    datetime.datetime or float
        The parsed timestamp, or ``np.nan`` when ``x`` is not a string, has
        fewer than five dot-separated fields, or its fifth field does not
        match the expected format.
    """
    try:
        return datetime.strptime(x.split('.')[4], '%y%m%d-%H%M')
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only genuine parse failures are mapped to NaN. A bare ``except``
        # would also swallow KeyboardInterrupt/SystemExit, making a long
        # ``Series.map(convert)`` impossible to interrupt.
        return np.nan
# Attach the build timestamp parsed from every OsBuildLab string.
df['DateBL'] = df['OsBuildLab'].map(convert)
df.head()
# Discard every row that still contains a missing value, then report how many
# rows are left.
df.dropna(inplace=True)
print(df.shape[0])
# Trailing expression of the cell: True if any missing value remains.
df.isna().values.any()

In [None]:
# Google Safe Browsing transparency-report data, joined onto df by week.
data = pd.read_csv('../input/google-safe-browsing-transparency-report-data/data.csv')
# Parse the WeekOf strings into datetime.date objects.
data['WeekOf'] = data['WeekOf'].map(lambda text: datetime.strptime(text, '%Y-%m-%d').date())
# Move every AvSigVersion timestamp forward to the Sunday of its week
# (weekday() is 0 for Monday and 6 for Sunday, so a Sunday maps to itself).
# NOTE(review): presumably the WeekOf dates in the Google data are Sundays -- confirm.
weekdictAS = {
    version: (stamp - timedelta(days=stamp.weekday() - 6)).date()
    for version, stamp in datedictAS.items()
}
df['WeekOf'] = df['AvSigVersion'].map(weekdictAS)
df = df.merge(data, on='WeekOf', how='left')
data.sample(5)
# Discard every row that still contains a missing value, then report how many
# rows are left.
df.dropna(inplace=True)
print(df.shape[0])
# Trailing expression of the cell: True if any missing value remains.
df.isna().values.any()

In [None]:
# Threat data: count the non-null 'index' entries recorded for each
# AvSigVersion (presumably one entry per threat -- confirm against the file)
# and join that count onto df as 'ThreatCount'.
data2 = pd.read_csv('../input/malware-avsigversion-threats/AvSigversion_Threats.csv')
cv = pd.DataFrame(data2.groupby('AvSigVersion')['index'].count()).rename({'index':'ThreatCount'},axis=1)
df = pd.merge(df,cv,on='AvSigVersion',how='left')
# Versions absent from the threat file get no count from the left join; treat
# them as zero. The result is assigned back because calling
# fillna(..., inplace=True) on df['ThreatCount'] is chained assignment: newer
# pandas warns about it, and under copy-on-write it does not update df.
df['ThreatCount'] = df['ThreatCount'].fillna(0)
data2.sample(10)
# Discard every row that still contains a missing value, then report how many
# rows are left.
df.dropna(inplace=True)
print(len(df))
# Trailing expression of the cell: True if any missing value remains.
df.isnull().values.any()

In [None]:
# Final data: remove the helper timestamp columns and the raw version strings
# in a single call.
df.drop(
    columns=['DateAS', 'DateOS', 'DateBL', 'WeekOf',
             'AvSigVersion', 'OsBuildLab', 'Census_OSVersion'],
    inplace=True,
)
# Discard every row that still contains a missing value, then report how many
# rows are left.
df.dropna(inplace=True)
print(df.shape[0])
df.isna().values.any()
# Trailing expression of the cell: preview of the final table.
df.head()

In [None]:
# Split the dataset into training and test sets.
# The label is separated from the predictors without mutating df, so this cell
# can be re-run (deleting the column from df in place would raise KeyError on
# the second run).
y = df['HasDetections']
X = df.drop(columns=['HasDetections'])
# 30% of the rows are held out for testing; the fixed random_state makes the
# split, and therefore the reported accuracy, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [None]:
# Create the AdaBoost classifier, train it and save the fitted model to disk.
import pickle
# random_state fixes the seed handed to the base estimators, so repeated runs
# fit the same model.
abc = AdaBoostClassifier(n_estimators=25,
                         learning_rate=1,
                         random_state=42)
# Train the AdaBoost classifier on the training split.
adaboost_model = abc.fit(X_train, y_train)

# File the fitted model is serialized to; the reload cell reads Pkl_Filename too.
Pkl_Filename = "Pickle_RL_Model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(adaboost_model, file)
print("done")

In [None]:
# Read the fitted model back from the pickle file named by Pkl_Filename.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open(Pkl_Filename, 'rb') as model_file:
    Pickled_adaboost_Model = pickle.load(model_file)

# Trailing expression of the cell: shows the restored estimator.
Pickled_adaboost_Model

In [None]:
# Predict the labels of the test rows with the model restored from the pickle file.
y_pred = Pickled_adaboost_Model.predict(X_test)

In [None]:
# Accuracy: share of test rows whose predicted label equals the true label.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)