In [None]:
# The training CSV is too large to load comfortably with a plain read_csv call.
# Declaring a compact dtype per column up front (small ints/floats, 'category'
# for strings) reduces the memory needed by the resulting DataFrame.
# Keys are listed in the same order as before; the mapping is passed to
# pd.read_csv(dtype=...) in the next cell.
dtypes = {
    'MachineIdentifier': 'category',
    'ProductName': 'category',
    'EngineVersion': 'category',
    'AppVersion': 'category',
    'AvSigVersion': 'category',
    'IsBeta': 'int8',
    'RtpStateBitfield': 'float16',
    'IsSxsPassiveMode': 'int8',
    'DefaultBrowsersIdentifier': 'float32',
    'AVProductStatesIdentifier': 'float32',
    'AVProductsInstalled': 'float16',
    'AVProductsEnabled': 'float16',
    'HasTpm': 'int8',
    'CountryIdentifier': 'int16',
    'CityIdentifier': 'float32',
    'OrganizationIdentifier': 'float16',
    'GeoNameIdentifier': 'float16',
    'LocaleEnglishNameIdentifier': 'int16',
    'Platform': 'category',
    'Processor': 'category',
    'OsVer': 'category',
    'OsBuild': 'int16',
    'OsSuite': 'int16',
    'OsPlatformSubRelease': 'category',
    'OsBuildLab': 'category',
    'SkuEdition': 'category',
    'IsProtected': 'float16',
    'AutoSampleOptIn': 'int8',
    'PuaMode': 'category',
    'SMode': 'float16',
    'IeVerIdentifier': 'float16',
    'SmartScreen': 'category',
    'Firewall': 'float16',
    'UacLuaenable': 'float64',
    'Census_MDC2FormFactor': 'category',
    'Census_DeviceFamily': 'category',
    'Census_OEMNameIdentifier': 'float32',
    'Census_OEMModelIdentifier': 'float32',
    'Census_ProcessorCoreCount': 'float16',
    'Census_ProcessorManufacturerIdentifier': 'float16',
    'Census_ProcessorModelIdentifier': 'float32',
    'Census_ProcessorClass': 'category',
    'Census_PrimaryDiskTotalCapacity': 'float64',
    'Census_PrimaryDiskTypeName': 'category',
    'Census_SystemVolumeTotalCapacity': 'float64',
    'Census_HasOpticalDiskDrive': 'int8',
    'Census_TotalPhysicalRAM': 'float32',
    'Census_ChassisTypeName': 'category',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float32',
    'Census_InternalPrimaryDisplayResolutionHorizontal': 'float32',
    'Census_InternalPrimaryDisplayResolutionVertical': 'float32',
    'Census_PowerPlatformRoleName': 'category',
    'Census_InternalBatteryType': 'category',
    'Census_InternalBatteryNumberOfCharges': 'float64',
    'Census_OSVersion': 'category',
    'Census_OSArchitecture': 'category',
    'Census_OSBranch': 'category',
    'Census_OSBuildNumber': 'int16',
    'Census_OSBuildRevision': 'int32',
    'Census_OSEdition': 'category',
    'Census_OSSkuName': 'category',
    'Census_OSInstallTypeName': 'category',
    'Census_OSInstallLanguageIdentifier': 'float16',
    'Census_OSUILocaleIdentifier': 'int16',
    'Census_OSWUAutoUpdateOptionsName': 'category',
    'Census_IsPortableOperatingSystem': 'int8',
    'Census_GenuineStateName': 'category',
    'Census_ActivationChannel': 'category',
    'Census_IsFlightingInternal': 'float16',
    'Census_IsFlightsDisabled': 'float16',
    'Census_FlightRing': 'category',
    'Census_ThresholdOptIn': 'float16',
    'Census_FirmwareManufacturerIdentifier': 'float16',
    'Census_FirmwareVersionIdentifier': 'float32',
    'Census_IsSecureBootEnabled': 'int8',
    'Census_IsWIMBootEnabled': 'float16',
    'Census_IsVirtualDevice': 'float16',
    'Census_IsTouchEnabled': 'int8',
    'Census_IsPenCapable': 'int8',
    'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
    'Wdft_IsGamer': 'float16',
    'Wdft_RegionIdentifier': 'float16',
    'HasDetections': 'int8',
}

In [None]:
# Load the training data, applying the per-column dtypes declared in `dtypes`.
import pandas as pd

train_df1 = pd.read_csv(
    '/kaggle/input/microsoft-malware-prediction/train.csv',
    dtype=dtypes,
)
# The test split is currently not loaded:
#test_df = pd.read_csv('/kaggle/input/microsoft-malware-prediction/test.csv', dtype=dtypes)

In [None]:
# Describe the dataset: display pandas' summary statistics for the loaded
# training frame as this cell's output.
train_df1.describe()

In [None]:
# PREPROCESSING
#
# 1) DIMENSIONALITY REDUCTION
#
# Step 1: missing-value ratio. Any column in which more than half of the
# entries are null is removed from the training frame.
missing_share = train_df1.isnull().mean()  # fraction of null entries per column
filtered_col = train_df1.columns[missing_share > 0.5]
print(filtered_col)
print(train_df1.shape[1])  # number of columns before the drop
train_df = train_df1.drop(columns=list(filtered_col))
print(train_df.shape[1])   # number of columns after the drop
#test_df=test_df.drop(list(filtered_col),axis=1)

    


In [None]:
# Step 2: near-constant columns.
# A column whose single most frequent value (missing values counted as a value)
# covers more than `threshold` of the rows barely separates the data points,
# so such columns are removed.
threshold = 0.8
l = [
    name
    for name in train_df.columns
    # value_counts sorts by frequency (descending), so the first entry is the
    # relative frequency of the dominant value
    if train_df[name].value_counts(normalize=True, dropna=False).iloc[0] > threshold
]
print(l)
print(len(l))
train_df = train_df.drop(columns=l)
print(train_df.shape[1])
#test_df=test_df.drop(l,axis=1)


In [None]:
# Show the dtype of every remaining column.
print(train_df.dtypes)
# Decide which variables are categorical and which are numerical.
# A numeric dtype does not make a variable quantitative: the numbers may simply
# label categories or levels. Inspecting the nature of the values together with
# the number of distinct values per column shows that most of the variables are
# categorical even though they are stored as numbers.
# Print the number of distinct values (a missing value counts as one) per column.
distinct_counts = train_df.nunique(dropna=False)
for name, n_distinct in distinct_counts.items():
    print(name, n_distinct)



    

In [None]:
#REPLACING MISSING VALUES IN EACH COLUMN
#since majority of the columns are categorical we are replacing the missing values by mode
for column in train_df.columns:
    col_values = train_df[column]
    # Columns without missing entries need no imputation; skipping them also
    # avoids computing a mode that would never be used.
    if not col_values.isna().any():
        continue
    modes = col_values.mode()
    # mode() ignores missing values, so it is empty when the whole column is
    # missing; there is nothing sensible to fill with in that case.
    if modes.empty:
        continue
    # Assign the filled column back to the frame. Calling
    # `train_df[column].fillna(..., inplace=True)` operates on an intermediate
    # object: recent pandas versions warn about it, and with Copy-on-Write
    # enabled it leaves `train_df` unchanged.
    train_df[column] = col_values.fillna(modes.iloc[0])

In [None]:
#since our dataset is very big(around 9 illion rows)we are applying simple random sampling on our dataset
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
train_final = train_df.sample(frac=0.7)
print(len(train_final))

In [None]:
### FREQUENCY ENCODING
# Replace each value of a categorical column by the share of rows that carry
# that value.
# NOTE(review): `categorical_columns` and `train_df_reduced` are not defined in
# the cells visible here (the sampled frame created earlier is `train_final`) —
# confirm both exist when the notebook is run top to bottom on a fresh kernel.
from scipy.stats import rankdata  # NOTE(review): not used in this cell; kept in case other cells rely on it
n_rows = len(train_df_reduced)  # constant across the loop: only column values change
for i in categorical_columns[1:]:  # the first listed column is deliberately left out, as before
    # number of rows per category divided by the total row count;
    # observed=True restricts category-dtype columns to categories that occur
    encoding = train_df_reduced.groupby(i, observed=True).size() / n_rows
    # On a category-dtype column, map() can hand back another category-dtype
    # column (when all mapped values are distinct). Cast explicitly so the
    # encoded feature is always a plain float column.
    train_df_reduced[i] = train_df_reduced[i].map(encoding).astype('float64')

