In [1]:
#Import all the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno #library to visualize missing data
import re#library for regular expression

from sklearn.preprocessing import MinMaxScaler,Normalizer,StandardScaler,RobustScaler,LabelEncoder, OneHotEncoder 
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
#Function to read data to dataframe

def read_file_to_df(filepath):
    
    df_loaded_ind=True
    df = pd.DataFrame()#define an empty dataframe
    
    if filepath.endswith(".csv"):
        df = pd.read_csv(filepath)
        
    elif (filepath.endswith(".xls") | filepath.endswith(".xlsx")):
        df = pd.read_excel(filepath)
        
    else:
        df_loaded_ind=False
        print('Invalid file type')
            
    return df_loaded_ind,df

In [3]:
filepath             = input('Please provide the file path for the data :')
data_loaded_ind,data = read_file_to_df(filepath)
if data_loaded_ind:
    print('Data is successfully loaded!!')
    input('Press enter to take a quick look\n')
    print(data.head())
    input('Press enter to see the description of your data')
    print(data.describe().T)
    input('Press enter to continue ...')
else:
    print('Error loading dataset!!')

Please provide the file path for the data :SFCRIME_train.csv
Data is successfully loaded!!
Press enter to take a quick look

                 Dates        Category                      Descript  \
0  2015-05-13 23:53:00        WARRANTS                WARRANT ARREST   
1  2015-05-13 23:53:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
2  2015-05-13 23:33:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
3  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   
4  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   

   DayOfWeek PdDistrict      Resolution                    Address  \
0  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
1  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
2  Wednesday   NORTHERN  ARREST, BOOKED  VANNESS AV / GREENWICH ST   
3  Wednesday   NORTHERN            NONE   1500 Block of LOMBARD ST   
4  Wednesday       PARK            NONE  100 Block of BRODERICK ST   

            X        

In [4]:
#Function to get the ratio of null values in columns of dataset in descending order
def get_null_count(df):
   
    na_count=df.isnull().sum().sort_values(ascending=False)/len(df)#columnwise ratio of null values
    na_count=na_count[na_count>0]#choose only the columns whic have null values

    return na_count
        

In [5]:
#Function to visualize the null value distribution
def visualize_null_count(df,null_count):

    filtered_data = msno.nullity_filter(df,  filter='bottom',n=len(null_count))
    msno.matrix(filtered_data)
        

In [6]:
if data.isnull().values.any():
    null_count = get_null_count(data)
    print('Below are the features that contain null values:\n', null_count)
    visualize_null_count(data,null_count)
    plt.show()
    input('Press enter to continue')

In [7]:
#Function to drop desired columns
def drop_columns(df,columnlist=[]):
    if columnlist:
        df.drop(columnlist,axis=1,inplace=True)#Drop all the columns in the list
    return df

In [8]:

drop_col_ind = (input('Would you like to drop any feature?-Y/N')).upper()
if drop_col_ind == 'Y':
    drop_null_column_list = []
    print('Enter the features you wish to drop')
    drop_null_column_list = [x for x in input().split()]
    if drop_null_column_list:
        data = drop_columns(data,columnlist=drop_null_column_list)
        print('Specified features are successfully dropped...')
        print('Existing features in the dataset are: ', data.columns.tolist())
        input('Press enter to continue')

Would you like to drop any feature?-Y/Ny
Enter the features you wish to drop
Descript Resolution 
Specified features are successfully dropped...
Existing features in the dataset are:  ['Dates', 'Category', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y']
Press enter to continue


In [9]:
#Function to impute missing values
def impute_missing(df, drop_nans=False):  

    #drop the rows containing null values if drop_nans is set to True
    if drop_nans:
        df.dropna(inplace=True)
    else:
        for column in df.columns.values:
        # Replace NaNs with the median or mode of the column depending on the column type
            try:
                column_median = df[column].median()
                df[column].fillna(column_median, inplace=True)

            except TypeError:
                column_mode = df[column].mode()[0]
                df[column].fillna(column_mode, inplace=True)
                
    if df.isnull().values.any():
        print('Error in replacing/dropping missing data')
    else:
        if drop_nans:
            print('Missing values are successfully dropped')   
        else:
            print('Missing values are successfully replaced')
    
    return df
    

In [10]:
if data.isnull().values.any():
    drop_na_rows = (input('Do you wish to drop all rows with null values?-Y/N')).upper()
    if drop_na_rows == 'Y':
        drop_nans = True

    data = impute_missing(data, drop_nans=False)
    input('Press enter to continue')


In [11]:
#Function to convert datatype of mentioned columns to corresponding mentioned types
def data_type_converter(df,column_names=[],to_types=[]):
    
    if (isinstance(column_names, list) & isinstance(to_types, list)):#check if passed item is in list format

        for col,typ in zip(column_names,to_types):
            df[col] = df[col].astype(typ) 

    return df

In [12]:
print('The data types of every feature are as follows:\n',data.dtypes)
type_change_ind = (input('Would you like to change the data type of any feature?-Y/N')).upper()
if type_change_ind == 'Y':
    cols_for_type_change=[]
    to_type_list=[]
    print('Enter the features for which you wish to change the data types')
    cols_for_type_change = [x for x in input().split()]
    print('Enter the type to which you would like to change the data types')
    to_type_list         = [x for x in input().split()]
    data = data_type_converter(data,column_names=cols_for_type_change,to_types=to_type_list)
    print('The changed data types are as follows:\n', data[cols_for_type_change].dtypes)
    input('Press enter to continue')

The data types of every feature are as follows:
 Dates          object
Category       object
DayOfWeek      object
PdDistrict     object
Address        object
X             float64
Y             float64
dtype: object
Would you like to change the data type of any feature?-Y/NY
Enter the features for which you wish to change the data types
Dates
Enter the type to which you would like to change the data types
datetime64[ns]
The changed data types are as follows:
 Dates    datetime64[ns]
dtype: object
Press enter to continue


In [13]:
#Function to separate data and target variable 
def separate_target(df_raw,target_name):
    
    target             = df_raw[target_name]#target variable 
    df                 = df_raw.drop(target_name,axis=1)#data without target variable
    target_is_separate = True
      
    return df,target,target_is_separate


In [14]:
def separate_cont_and_cat_features(df):
    
    categorical_features = df.select_dtypes(include=['object','bool']).columns.tolist()
    continuous_features  = df.select_dtypes(include=['number']).columns.tolist()
    date_features  = [x for x in df.columns if x not in (categorical_features+continuous_features)]
    
    return continuous_features,categorical_features,date_features
    

In [15]:
#Function to splite timestamp/date column into sub features
def date_splitter(df, fldname):
    
    #convert the specified feature to timestamp format if not already
    fld = df[fldname] 
    if not np.issubdtype(fld.dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, 
                                     infer_datetime_format=True)
        
    targ_pre = re.sub('[Dd]ate$', '', fldname)#substitute the phrase 'date' by spaces in the original field name
    
    #obtain different sub features for the date column
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 
            'Dayofyear', 'Is_month_end', 'Is_month_start', 
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 
            'Is_year_start'):
        df[targ_pre+n] = getattr(fld.dt,n.lower()).apply(lambda x: int(x))
    
    return df

In [16]:
#Function to convert date field to its Julian format
def Juliandtconv(df,fldname):
    
    #convert the specified feature to timestamp format if not already
    df[fldname]=pd.to_datetime(df[fldname],infer_datetime_format=True)
    
    df[fldname+'Julian'] = df[fldname].apply(lambda x: x.to_julian_date())#convert to julian format
            
    return  df

In [17]:
#Function to concatenate data and target variable 
def concatenate_target(df,target):
    target_is_separate=False
    return (pd.concat([df,target],axis=1)),target_is_separate


In [18]:
target_name    = input('Please enter the target feature')


Please enter the target featureCategory


In [19]:
data,target,target_is_separate_ind                     = separate_target(data,target_name)
continuous_features,categorical_features,date_features = separate_cont_and_cat_features(data)

if date_features:
    print("Transforming date features in the data...")
    print("The date features in the dataset are : ", date_features)
    
    for i in date_features:
        print("Please specify whether you would like to split {} or convert it to julian format".format(i))
        print("Enter S to split, J to convert to Julian")
        split_or_julian_ind=input().upper()
        if split_or_julian_ind == 'S':
            data = date_splitter(data, fldname=i)
        else:
            data = Juliandtconv(data,fldname=i)
            
    date_transformed_data       = drop_columns(data,columnlist=date_features)
    
    data,target_is_separate_ind = concatenate_target(date_transformed_data,target)
    
                
    print('Date tarnsformed data:\n', data.head())
    
    input('Press enter to continue')


Transforming date features in the data...
The date features in the dataset are :  ['Dates']
Please specify whether you would like to split Dates or convert it to julian format
Enter S to split, J to convert to Julian

Date tarnsformed data:
    DayOfWeek PdDistrict                    Address           X          Y  \
0  Wednesday   NORTHERN         OAK ST / LAGUNA ST -122.425892  37.774599   
1  Wednesday   NORTHERN         OAK ST / LAGUNA ST -122.425892  37.774599   
2  Wednesday   NORTHERN  VANNESS AV / GREENWICH ST -122.424363  37.800414   
3  Wednesday   NORTHERN   1500 Block of LOMBARD ST -122.426995  37.800873   
4  Wednesday       PARK  100 Block of BRODERICK ST -122.438738  37.771541   

    DatesJulian        Category  
0  2.457156e+06        WARRANTS  
1  2.457156e+06  OTHER OFFENSES  
2  2.457156e+06  OTHER OFFENSES  
3  2.457156e+06   LARCENY/THEFT  
4  2.457156e+06   LARCENY/THEFT  
Press enter to continue


In [20]:
#Convert categorical features to numerical
def encoding(df,categorical_features):
    
    df_cat = df[categorical_features]
    #By defaault, convert all the categories to numerical forms using Label encoder
    le = {}
    for i in range(len(df_cat.columns.tolist())):
        le[i] = LabelEncoder()
        df_cat.iloc[:,i] = le[i].fit_transform(df_cat.iloc[:,i])

    return df_cat

In [21]:
if target_is_separate_ind == False:
    continuous_features,categorical_features,date_features = separate_cont_and_cat_features(data)

if categorical_features:
    print('Here is the list of categorical features in the dataset:\n', categorical_features)
    print('Encoding all categorical featues as numerical...')
    encoded_data  = encoding(data,categorical_features)
    data          = pd.concat([encoded_data,data[continuous_features+date_features]],axis=1)
    print('Encoded data looks like this: \n',data.head())
    input('Press enter to continue')

Here is the list of categorical features in the dataset:
 ['DayOfWeek', 'PdDistrict', 'Address', 'Category']
Encoding all categorical featues as numerical...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Encoded data looks like this: 
    DayOfWeek  PdDistrict  Address  Category           X          Y  \
0          6           4    19790        37 -122.425892  37.774599   
1          6           4    19790        21 -122.425892  37.774599   
2          6           4    22697        21 -122.424363  37.800414   
3          6           4     4266        16 -122.426995  37.800873   
4          6           5     1843        16 -122.438738  37.771541   

    DatesJulian  
0  2.457156e+06  
1  2.457156e+06  
2  2.457156e+06  
3  2.457156e+06  
4  2.457156e+06  
Press enter to continue


In [22]:
#Function to bring all data into same scale
def scale_data(df,scaling_type):  
    
    if scaling_type == 'MinMax':
        scaler = MinMaxScaler()#scales based on minimum and maximum values in dataset
        
    if scaling_type == 'Standardize':
        scaler = StandardScaler()#scales based on mean and standard deviation of dataset setting mean to 0 and standard deviation to 1
        
    if scaling_type == 'Robustscale':
        scaler = RobustScaler()#scales based on interquartile range of data

    return pd.DataFrame(scaler.fit_transform(df),columns=df.columns)


In [23]:
if not target_is_separate_ind:
    data,target,target_is_separate_ind                     = separate_target(data,target_name)

In [None]:
scaling_method = input('Please choose the scaling method: MinMax/Standardize/Robustscale')

if scaling_method in ['Standardize', 'Robustscale','MinMax']:
    scale      = scaling_method
else:
    print('Invalid scaling method chosen. By default, data is scaled using Min-Max scaler')
    scale      = 'MinMax'

data,target_is_separate_ind       = concatenate_target(scale_data(data,scaling_type= scale),target)

print('Scaled data:\n',data.head())
input('Press enter to continue')

Please choose the scaling method: MinMax/Standardize/Robustscale
Invalid scaling method chosen. By default, data is scaled using Min-Max scaler
Scaled data:
    DayOfWeek  PdDistrict   Address         X         Y  DatesJulian  Category
0        1.0    0.444444  0.852026  0.043578  0.001276     1.000000        37
1        1.0    0.444444  0.852026  0.043578  0.001276     1.000000        21
2        1.0    0.444444  0.977182  0.044337  0.001770     0.999997        21
3        1.0    0.444444  0.183666  0.043030  0.001778     0.999996        16
4        1.0    0.555556  0.079347  0.037198  0.001217     0.999996        16


In [None]:
#Visualize correlation between various features in dataset
def halfHeatMap_corr(df, mirror=False,figsize_x=20, figsize_y=20):

    # Create Correlation df
    corr = df.corr()
    # Plot figsize
    fig, ax = plt.subplots(figsize=(figsize_x, figsize_y))
    # Generate Color Map
    colormap = sns.diverging_palette(220, 10, as_cmap=True)

    if mirror == False:
        # Drop self-correlations
        dropSelf = np.zeros_like(corr)
        dropSelf[np.triu_indices_from(dropSelf)] = True
        
    # Generate heatmap with mask on redundant values with a precision of 2 
    sns.heatmap(corr, cmap=colormap, annot=True, fmt=".2f", mask=dropSelf)

    plt.show()

In [None]:
#Function to drop one of the two correlated features, the correlation coeffiecient is higher than the threshold
def drop_highly_corr_features(df,threshold=0.95):
    # Create correlation matrix
    corr_matrix = df.corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

    # Find index of feature columns with correlation greater than 0.95
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    
    #drop the column
    df.drop(to_drop,axis =1,inplace=True)

In [None]:
#Function tio obtain a threshold value for outlier removal
def get_outlier_threshold(df):
        
    Q1=df.quantile(0.25)#1st qaurtile 
    Q3=df.quantile(0.75)#3rd qaurtile
    
    IQR=Q3-Q1#Inter quartile range
    #print(Q3+1.5*IQR)
    
    upper_threshold  = (Q3+1.5*IQR).mean()
    lower_threshold  = (Q1-1.5*IQR).mean()
        
    return (upper_threshold,lower_threshold)#return thresholds as a tuple
 

In [None]:
#Function to visualize outliers in the data
def plotoutliers(df,target,collist=[],figsize_x=16,figsize_y=6):
    
    df,trgt=separate_target(df,target)#separate data and target
    
    #If the feature whose outliers are to be visualized, consider only those features else consider all continuous variables
    if collist:
        df=df[collist]
    else:
        df=df._get_numeric_data()

    plt.figure(figsize=(figsize_x, figsize_y))#specify the size of the figure
    ax=sns.boxplot(x="variable", y="value", data=pd.melt(df),showmeans=True)#plot outliers
    ax.set_xticklabels(ax.get_xticklabels(),rotation=45)#rotate the labels so as to adjust space
    
    threshold = get_outlier_threshold(df)#obtain upper and lower threshold values 
   
    ax.axhline(y=threshold[0], color='r',linestyle='--',label='Upper threshold')#plot upper threshold
    ax.axhline(y=threshold[1], color='b',linestyle='--',label='Lower threshold')#plot lower threshold
    ax.legend(loc='upper right')#set legend

    plt.show()
    
    return threshold
    

In [None]:
#Function to drop rows which contain outliers
def remove_outliers(df,column,threshold=(0,0),method='IQR'):
    #Default method is to filter outliers on the basis of interquartile range as we need the data to be normally distributed to filter data based on standard deviation
    
    if method:

        Q1      = df[column].quantile(0.25)#1st qaurtile of the feature
        Q3      = df[column].quantile(0.75)#3rd quartile of the feature
        IQR     = Q3 - Q1#interquartile range

        df = df[~((df[column] < (Q1 - 1.5 * IQR)) |(df[column] > (Q3 + 1.5 * IQR)))]
    
    if method =='threshold':
        
        df = df[~((df[column] < threshold[1]) |(df[column] > threshold[0]))]#remove data which is more than upper threshold and lesser than lower threshold
        
    if method == 'std':
        
        mean = df[column].mean()#mean of the feature
        std  = df[column].std()#standard deviation of the feature
        
        df = df[df[column] < (3*std + mean)]
        
    return df


In [None]:
#Function to plot and obtain feature importance
def feature_imp(df,target,classification=True):
    
    df,trgt=separate_target(df,target)#separate target variable
    
    #Use random forest with 100 estimators to decide important features
    if classification:
        rnd_clf = RandomForestClassifier(n_estimators = 100 , criterion = 'entropy',random_state = 42)
    else:
        rnd_clf = RandomForestRegressor (n_estimators = 100 , criterion = 'entropy',random_state = 42)
    
    rnd_clf.fit(df,trgt)#fit the model
    
    #obtain feature importance along with columns
    x, y = (list(x) for x in zip(*sorted(zip(rnd_clf.feature_importances_, df.columns), reverse = False)))
    
    fi_df=pd.DataFrame(x,index=y,columns=['Importance'])#create dataframe of feature importance
    
    #plot feature importance
    plt.barh(np.arange(len(y)), x, align='center', alpha=0.5,color='g')
    plt.yticks(np.arange(len(y)), y)
    plt.xlabel('Importance')
    
    return fi_df


In [None]:
#Function to obtain the list of important features by referring to feature importance plot and thereby setting threshold
def get_imp_features(df,threshold):
    
    imp_features_df=df[(df['Importance']>threshold)].index.values.tolist()
    
    return imp_features_df
    

In [None]:
#Function to visualize distribution of different levels of categorical variable 
def check_distribution(df,col=None):
    pd.value_counts(df[col]).sort_values(ascending=False).plot(kind="bar")#obtain count of different categories in descending order
    

In [None]:
##Function to reduce the number of features in the data
def dimreduction(df,reqd_dim):
    
    components_required=min(reqd_dim,len(df.columns)) #if number of features < 30, then reduce dimension to to number of features else to 30
    pca = PCA(n_components = components_required)
    principalComponents = pca.fit_transform(df.iloc[:,:].values)#converting data to array so as to fit
    principalDf = pd.DataFrame(data = principalComponents)#convert compressed data to dataframe
    
    return principalDf