In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# General Information about Data Sets

In [2]:
train2= pd.read_csv('cleaned_train.csv')
weather= pd.read_csv('cleaned_weather.csv')
spray2= pd.read_csv('cleaned_spray.csv')
train= train2.drop(['Year','Month','Day'], axis=1)
spray= spray2.drop(['Year','YearMonth','Day','YearWeek','Month'], axis=1)
train['Date']=pd.to_datetime(train['Date'])
spray['Date']=pd.to_datetime(spray['Date'])
weather['Date']=pd.to_datetime(weather['Date'])
weather['YearWeek']= (weather['Year'].astype(str)+weather['Week'].astype(str)).astype('int64')
weather['YearMonth']=(weather['Year'].astype(str)+weather['Month'].astype(str)).astype('int64')
weather.drop(['Week','Day'], axis=1, inplace=True)

In [3]:
print(train.info(),weather.info(),spray.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9693 entries, 0 to 9692
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          9693 non-null   datetime64[ns]
 1   Species       9693 non-null   object        
 2   Latitude      9693 non-null   float64       
 3   Longitude     9693 non-null   float64       
 4   NumMosquitos  9693 non-null   int64         
 5   WnvPresent    9693 non-null   int64         
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 454.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472 entries, 0 to 1471
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         1472 non-null   datetime64[ns]
 1   Tmax         1472 non-null   int64         
 2   Tmin         1472 non-null   int64         
 3   Tavg         1472 non-null   int64         
 4   Depar

In [4]:
weather.head()

Unnamed: 0,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Year,Month,YearWeek,YearMonth
0,2007-05-01,83,51,67,14.0,51.0,56.5,0.0,2.5,0.0,29.14,29.82,2.2,26.0,9.4,2007,5,200718,20075
1,2007-05-02,59,42,51,-3.0,42.0,47.0,13.5,0.0,0.0,29.41,30.085,13.15,3.0,13.4,2007,5,200718,20075
2,2007-05-03,66,47,57,2.0,40.0,49.0,8.0,0.0,0.0,29.425,30.12,12.3,6.5,12.55,2007,5,200718,20075
3,2007-05-04,72,50,61,4.0,41.5,50.0,4.0,0.0,0.0,29.335,30.045,10.25,7.5,10.6,2007,5,200718,20075
4,2007-05-05,66,53,60,5.0,38.5,49.5,5.0,0.0,0.0,29.43,30.095,11.45,7.0,11.75,2007,5,200718,20075


In [5]:
weather.isna().any().sum()

0

In [6]:
train.head()

Unnamed: 0,Date,Species,Latitude,Longitude,NumMosquitos,WnvPresent
0,2007-05-29,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,1,0
1,2007-05-29,CULEX RESTUANS,41.95469,-87.800991,1,0
2,2007-05-29,CULEX RESTUANS,41.994991,-87.769279,1,0
3,2007-05-29,CULEX PIPIENS/RESTUANS,41.974089,-87.824812,1,0
4,2007-05-29,CULEX RESTUANS,41.974089,-87.824812,4,0


In [7]:
train.isna().any().sum()

0

# Weather & Train Data Sets - EDA and Feature Engineering

According to the researches, if the weather gets too hot and too dry, mosquitoes will not be as active and feeding as they usually are. But once the humidity increases they’re more hungry and biting more.Therefore humidity plays a key role in WVN Presence. We will add the relative humidity as a feature.

In [8]:
# calculation of RELATIVE HUMIDITY
def rel_humidity(df,T, Td,Tw):
    
# Convert the air temperature and dew-point temperature to Celsius.(C=5*(F-32)/9)
    Tc= (5.0*(df[T]-32))/9.0
    Tdc= (5.0*(df[Td]-32))/9.0

#Calculate the saturated vapor pressure with a formula.  
    es=6.11*10.0**(7.5*Tc/(237.7+Tc))

#Find the actual vapor pressure with the same formula.
    e=6.11*10.0**(7.5*Tdc/(237.7+Tdc))
    
#Calculate the relative humidity.    
    df['RelHumidity']= round((e/es)*100)
    return df

In [9]:
rel_humidity(weather,'Tavg','DewPoint', 'WetBulb')

Unnamed: 0,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Year,Month,YearWeek,YearMonth,RelHumidity
0,2007-05-01,83,51,67,14.0,51.0,56.5,0.0,2.5,0.000,29.140,29.820,2.20,26.0,9.40,2007,5,200718,20075,56.0
1,2007-05-02,59,42,51,-3.0,42.0,47.0,13.5,0.0,0.000,29.410,30.085,13.15,3.0,13.40,2007,5,200718,20075,71.0
2,2007-05-03,66,47,57,2.0,40.0,49.0,8.0,0.0,0.000,29.425,30.120,12.30,6.5,12.55,2007,5,200718,20075,53.0
3,2007-05-04,72,50,61,4.0,41.5,50.0,4.0,0.0,0.000,29.335,30.045,10.25,7.5,10.60,2007,5,200718,20075,49.0
4,2007-05-05,66,53,60,5.0,38.5,49.5,5.0,0.0,0.000,29.430,30.095,11.45,7.0,11.75,2007,5,200718,20075,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1467,2014-10-27,78,52,65,16.0,51.5,58.5,0.5,1.0,0.010,28.960,29.665,12.35,19.0,13.25,2014,10,201444,201410,62.0
1468,2014-10-28,67,46,57,10.0,39.0,47.5,8.0,0.0,0.025,29.190,29.850,14.40,26.0,15.10,2014,10,201444,201410,51.0
1469,2014-10-29,49,38,44,-4.0,33.0,41.0,21.0,0.0,0.000,29.390,30.065,9.00,29.0,9.45,2014,10,201444,201410,65.0
1470,2014-10-30,52,34,43,-4.0,34.5,41.0,21.5,0.0,0.000,29.375,30.095,5.50,23.5,6.00,2014,10,201444,201410,72.0


I will calculate time lags by giving higher weights to the most recent observed values.

In [10]:
def ema(df,col, span):
    df[f'{col}_{span}']= round(df[col].ewm(span=span, adjust=True).mean(),2)
    return df.dropna()
ema_list= ['Tmax', 'DewPoint','WetBulb','PrecipTotal', 'StnPressure','SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed','RelHumidity']
span= list(range(1,15))
for i in ema_list:
    for s in span:
        ema(weather,i,s)

In [11]:
weather.head()

Unnamed: 0,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,PrecipTotal,...,RelHumidity_5,RelHumidity_6,RelHumidity_7,RelHumidity_8,RelHumidity_9,RelHumidity_10,RelHumidity_11,RelHumidity_12,RelHumidity_13,RelHumidity_14
0,2007-05-01,83,51,67,14.0,51.0,56.5,0.0,2.5,0.0,...,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0
1,2007-05-02,59,42,51,-3.0,42.0,47.0,13.5,0.0,0.0,...,65.0,64.75,64.57,64.44,64.33,64.25,64.18,64.12,64.08,64.04
2,2007-05-03,66,47,57,2.0,40.0,49.0,8.0,0.0,0.0,...,59.32,59.47,59.57,59.64,59.69,59.73,59.76,59.78,59.8,59.82
3,2007-05-04,72,50,61,4.0,41.5,50.0,4.0,0.0,0.0,...,55.03,55.42,55.7,55.91,56.07,56.19,56.3,56.38,56.45,56.51
4,2007-05-05,66,53,60,5.0,38.5,49.5,5.0,0.0,0.0,...,51.18,51.77,52.19,52.52,52.78,52.98,53.15,53.29,53.41,53.51


In [12]:
weather.isna().any().sum()

0

In [13]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472 entries, 0 to 1471
Columns: 160 entries, Date to RelHumidity_14
dtypes: datetime64[ns](1), float64(152), int64(7)
memory usage: 1.8 MB


In [14]:
w_train= train.merge(weather, on= ['Date'])
w_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9693 entries, 0 to 9692
Columns: 165 entries, Date to RelHumidity_14
dtypes: datetime64[ns](1), float64(154), int64(9), object(1)
memory usage: 12.3+ MB


**The below chart indicates CULEX RESTUANS species were most affected by temperature increases.**

In [15]:
w_train=pd.get_dummies(w_train,drop_first=True)
#w_train=pd.get_dummies(w_train)
w_train.head()

Unnamed: 0,Date,Latitude,Longitude,NumMosquitos,WnvPresent,Tmax,Tmin,Tavg,Depart,DewPoint,...,RelHumidity_11,RelHumidity_12,RelHumidity_13,RelHumidity_14,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS
0,2007-05-29,41.95469,-87.800991,1,0,88,62,75,10.0,58.5,...,54.88,54.61,54.37,54.16,0,1,0,0,0,0
1,2007-05-29,41.95469,-87.800991,1,0,88,62,75,10.0,58.5,...,54.88,54.61,54.37,54.16,0,0,1,0,0,0
2,2007-05-29,41.994991,-87.769279,1,0,88,62,75,10.0,58.5,...,54.88,54.61,54.37,54.16,0,0,1,0,0,0
3,2007-05-29,41.974089,-87.824812,1,0,88,62,75,10.0,58.5,...,54.88,54.61,54.37,54.16,0,1,0,0,0,0
4,2007-05-29,41.974089,-87.824812,4,0,88,62,75,10.0,58.5,...,54.88,54.61,54.37,54.16,0,0,1,0,0,0


In [16]:
w_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9693 entries, 0 to 9692
Columns: 170 entries, Date to Species_CULEX TERRITANS
dtypes: datetime64[ns](1), float64(154), int64(9), uint8(6)
memory usage: 12.3 MB


In [17]:
w_train.isna().any().sum()

0

In [18]:
print('\n','The Percantage of Virus per Each Species  ','\n')
print('CULEX PIPIENS: ', round(w_train[w_train.WnvPresent==1]['Species_CULEX PIPIENS'].sum()/len(w_train[w_train.WnvPresent==1]['Species_CULEX PIPIENS'])*100,2))
print('CULEX RESTUANS: ', round(w_train[w_train.WnvPresent==1]['Species_CULEX RESTUANS'].sum()/len(w_train[w_train.WnvPresent==1]['Species_CULEX RESTUANS'])*100,2))
print('CULEX PIPIENS/RESTUANS: ', round(w_train[w_train.WnvPresent==1]['Species_CULEX PIPIENS/RESTUANS'].sum()/len(w_train[w_train.WnvPresent==1]['Species_CULEX PIPIENS/RESTUANS'])*100),2)


 The Percantage of Virus per Each Species   

CULEX PIPIENS:  41.35
CULEX RESTUANS:  9.74
CULEX PIPIENS/RESTUANS:  49.0 2


**The below Correlation report indicates Relative Humidity is highly correlated with WN Virus, consequently Dew Temperature and Wet Bulb. Also their 7 and 14 days Exponential Mean Averages have higher correlation than the actual observation values. This must be because of the incubation period.**

figure = plt.figure(figsize=(10,40))
sns.heatmap(w_train.corr()[['WnvPresent']].sort_values('WnvPresent',ascending=False),annot=True, cmap='YlGnBu')

**We will check if there is multicollinearity among the features and fix them.**

In [19]:
X = w_train.drop('WnvPresent', axis=1)
y = w_train['WnvPresent']
X_train,X_test, y_train,y_test = train_test_split(X, y,test_size=0.3)
print(X_train.shape, y_train.shape)
print(len(X_train.columns[X_train.isna().any()]))

(6785, 169) (6785,)
0


Weight of Evidence (WOE) helps to transform a continuous independent variable into a set of groups or bins based on similarity of dependent variable distribution i.e. number of events and non-events.
Informtion of Evidence (IV) helps to rank variables on the basis of their importance. It measures predictive power of independent variables

Information value is not an optimal feature (variable) selection method when you are building a classification model other than binary logistic regression (for eg. random forest or SVM) as conditional log odds (which we predict in a logistic regression model) is highly related to the calculation of weight of evidence. In other words, it's designed mainly for binary logistic regression model. Also think this way - Random forest can detect non-linear relationship very well so selecting variables via Information Value and using them in random forest model might not produce the most accurate and robust predictive model.

In [20]:
pd.options.mode.use_inf_as_na = True
max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n = max_bin):
    
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1 
        except Exception as e:
            n = n - 1

    if len(d2) == 1:
        n = force_bin         
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)}) 
        d2 = d1.groupby('Bucket', as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3=d3.reset_index(drop=True)
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]       
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    
    return(d3)

def char_bin(Y, X):
        
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]    
    df2 = notmiss.groupby('X',as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]      
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)
    
    return(d3)

def data_vars(df1, target):
    
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]
    
    x = df1.dtypes.index
    count = -1
    
    for i in x:
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i            
                count = count + 1
                
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv,ignore_index=True)
    
    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df,iv)

In [21]:
final_iv, IV = data_vars(X_train, y_train)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [22]:
final_iv.head()

Unnamed: 0,VAR_NAME,MIN_VALUE,MAX_VALUE,COUNT,EVENT,EVENT_RATE,NONEVENT,NON_EVENT_RATE,DIST_EVENT,DIST_NON_EVENT,WOE,IV
0,Date,2007-05-29 00:00:00,2007-05-29 00:00:00,18,0,0.0,18,1.0,0.0,0.002795,0.0,0.928645
1,Date,2007-06-05 00:00:00,2007-06-05 00:00:00,42,0,0.0,42,1.0,0.0,0.006521,0.0,0.928645
2,Date,2007-06-26 00:00:00,2007-06-26 00:00:00,51,0,0.0,51,1.0,0.0,0.007918,0.0,0.928645
3,Date,2007-06-29 00:00:00,2007-06-29 00:00:00,35,0,0.0,35,1.0,0.0,0.005434,0.0,0.928645
4,Date,2007-07-02 00:00:00,2007-07-02 00:00:00,53,0,0.0,53,1.0,0.0,0.008229,0.0,0.928645


In [23]:
final_iv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 548 entries, 0 to 547
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   VAR_NAME        548 non-null    object 
 1   MIN_VALUE       548 non-null    object 
 2   MAX_VALUE       548 non-null    object 
 3   COUNT           548 non-null    int64  
 4   EVENT           548 non-null    int64  
 5   EVENT_RATE      548 non-null    float64
 6   NONEVENT        548 non-null    int64  
 7   NON_EVENT_RATE  548 non-null    float64
 8   DIST_EVENT      548 non-null    float64
 9   DIST_NON_EVENT  548 non-null    float64
 10  WOE             548 non-null    float64
 11  IV              548 non-null    float64
dtypes: float64(6), int64(3), object(3)
memory usage: 51.5+ KB


In [24]:
final_iv.isna().any().sum()

0

In [25]:
IV.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   VAR_NAME  169 non-null    object 
 1   IV        169 non-null    float64
dtypes: float64(1), object(1)
memory usage: 2.8+ KB


In [26]:
IV.isna().any().sum()

0

In [27]:
features = list(IV[(IV['IV'] >0.35) & (IV['IV'] <0.5)]['VAR_NAME'])
X2 = X_train[features]
X2.head()

Unnamed: 0,DewPoint_10,DewPoint_11,DewPoint_12,DewPoint_13,DewPoint_14,DewPoint_3,DewPoint_4,DewPoint_6,DewPoint_7,DewPoint_8,DewPoint_9,WetBulb_10,WetBulb_11,WetBulb_12,WetBulb_13,WetBulb_4,WetBulb_5,WetBulb_6,WetBulb_7,WetBulb_9
2640,54.78,55.25,55.67,56.04,56.37,47.1,49.09,51.85,52.81,53.58,54.23,60.27,60.7,61.07,61.4,55.19,56.55,57.62,58.49,59.78
4450,56.75,56.77,56.78,56.78,56.77,55.49,55.88,56.4,56.55,56.65,56.71,62.91,62.89,62.87,62.84,62.42,62.63,62.77,62.86,62.91
6196,68.24,68.02,67.78,67.54,67.31,68.22,68.61,68.85,68.78,68.64,68.46,72.55,72.4,72.24,72.08,72.49,72.72,72.82,72.83,72.68
5375,57.86,57.7,57.52,57.32,57.12,57.0,57.59,58.08,58.12,58.08,57.99,62.65,62.53,62.38,62.21,61.28,62.0,62.42,62.64,62.72
8858,55.69,55.92,56.13,56.32,56.49,52.15,52.94,54.22,54.69,55.09,55.41,53.32,53.64,53.98,54.31,53.54,52.86,52.6,52.61,53.02


In [28]:
X2.isna().any().sum()

0

In [29]:
X2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6785 entries, 2640 to 9271
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DewPoint_10  6785 non-null   float64
 1   DewPoint_11  6785 non-null   float64
 2   DewPoint_12  6785 non-null   float64
 3   DewPoint_13  6785 non-null   float64
 4   DewPoint_14  6785 non-null   float64
 5   DewPoint_3   6785 non-null   float64
 6   DewPoint_4   6785 non-null   float64
 7   DewPoint_6   6785 non-null   float64
 8   DewPoint_7   6785 non-null   float64
 9   DewPoint_8   6785 non-null   float64
 10  DewPoint_9   6785 non-null   float64
 11  WetBulb_10   6785 non-null   float64
 12  WetBulb_11   6785 non-null   float64
 13  WetBulb_12   6785 non-null   float64
 14  WetBulb_13   6785 non-null   float64
 15  WetBulb_4    6785 non-null   float64
 16  WetBulb_5    6785 non-null   float64
 17  WetBulb_6    6785 non-null   float64
 18  WetBulb_7    6785 non-null   float64
 19  Wet

In [30]:
X1 = X2._get_numeric_data()

In [31]:
X1.isna().any().sum()

0

In [32]:
def iterate_vif(df, vif_threshold=5, max_vif=6):
  count = 0
  while max_vif > vif_threshold:
    count += 1
    print("Iteration # "+str(count))
    vif = pd.DataFrame()
    vif["VIFactor"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    vif["features"] = df.columns
    
    if round((vif['VIFactor'].max()),1) > vif_threshold:
      print('Removing %s with VIF of %f' % (vif[vif['VIFactor'] == vif['VIFactor'].max()]['features'].values[0], round((vif['VIFactor'].max()),1)))
      df = df.drop(vif[vif['VIFactor'] == vif['VIFactor'].max()]['features'].values[0], axis=1)
      max_vif = round((vif['VIFactor'].max()),1)
    else:
        print('Complete')
        return df, round((vif.sort_values('VIFactor')),1)
X1 = X2._get_numeric_data()
final_df, final_vif = iterate_vif(X1)


Iteration # 1
Removing WetBulb_12 with VIF of 627737614.900000
Iteration # 2
Removing WetBulb_10 with VIF of 505942275.100000
Iteration # 3
Removing WetBulb_6 with VIF of 419251306.200000
Iteration # 4
Removing DewPoint_10 with VIF of 401862117.200000
Iteration # 5
Removing DewPoint_13 with VIF of 386968820.200000
Iteration # 6
Removing DewPoint_9 with VIF of 327295176.300000
Iteration # 7
Removing DewPoint_7 with VIF of 287441975.700000
Iteration # 8
Removing WetBulb_9 with VIF of 253882338.100000
Iteration # 9
Removing DewPoint_12 with VIF of 241749050.300000
Iteration # 10
Removing DewPoint_8 with VIF of 114384725.600000
Iteration # 11
Removing WetBulb_7 with VIF of 95741521.100000
Iteration # 12
Removing WetBulb_11 with VIF of 11606515.200000
Iteration # 13
Removing DewPoint_4 with VIF of 2702816.000000
Iteration # 14
Removing DewPoint_11 with VIF of 492136.800000
Iteration # 15
Removing WetBulb_5 with VIF of 303932.500000
Iteration # 16
Removing DewPoint_6 with VIF of 7492.200000


ValueError: zero-size array to reduction operation maximum which has no identity

In [None]:
X_train=final_df
X_train.head()

In [None]:
X_train.info()

In [None]:
X_train.isna().any().sum()

In [None]:
final_vif

In [None]:
final_vif.info()