In [526]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import matplotlib
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
import seaborn as sns

from sklearn import datasets 
from sklearn.feature_selection import RFE 
from sklearn.linear_model import LogisticRegression

from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn import preprocessing

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Binarizer

In [527]:
# read fire data
conn = sqlite3.connect('FPA_FOD_20170508.sqlite')
df = pd.read_sql_query("SELECT FIRE_YEAR, STAT_CAUSE_DESCR, LATITUDE, LONGITUDE, STATE, COUNTY, FIPS_NAME, DISCOVERY_DATE, CONT_DATE, FIRE_SIZE, DISCOVERY_DOY, DISCOVERY_TIME FROM 'Fires'", conn)
df['DATE'] = pd.to_datetime(df['DISCOVERY_DATE'] - pd.Timestamp(0).to_julian_date(), unit='D')
df['END_DATE'] = pd.to_datetime(df['CONT_DATE'] - pd.Timestamp(0).to_julian_date(), unit='D')
print(df.head())

FIRE_YEAR STAT_CAUSE_DESCR   LATITUDE   LONGITUDE STATE COUNTY  FIPS_NAME  \
0       2005    Miscellaneous  40.036944 -121.005833    CA     63     Plumas   
1       2004        Lightning  38.933056 -120.404444    CA     61     Placer   
2       2004   Debris Burning  38.984167 -120.735556    CA     17  El Dorado   
3       2004        Lightning  38.559167 -119.913333    CA      3     Alpine   
4       2004        Lightning  38.559167 -119.933056    CA      3     Alpine   

   DISCOVERY_DATE  CONT_DATE  FIRE_SIZE  DISCOVERY_DOY DISCOVERY_TIME  \
0       2453403.5  2453403.5       0.10             33           1300   
1       2453137.5  2453137.5       0.25            133           0845   
2       2453156.5  2453156.5       0.10            152           1921   
3       2453184.5  2453189.5       0.10            180           1600   
4       2453184.5  2453189.5       0.10            180           1600   

        DATE   END_DATE  
0 2005-02-02 2005-02-02  
1 2004-05-12 2004-05-12  
2 200

In [528]:
# Extract and reformat DATETIME for easy lookup for fire data
df['TIME'] = df['DISCOVERY_TIME'].str[0:2]
df['DATETIME'] = df['DATE'].dt.strftime('%Y-%m-%d') + ' ' +  df['TIME']
print(df)

FIRE_YEAR   STAT_CAUSE_DESCR   LATITUDE   LONGITUDE STATE COUNTY  \
0             2005      Miscellaneous  40.036944 -121.005833    CA     63   
1             2004          Lightning  38.933056 -120.404444    CA     61   
2             2004     Debris Burning  38.984167 -120.735556    CA     17   
3             2004          Lightning  38.559167 -119.913333    CA      3   
4             2004          Lightning  38.559167 -119.933056    CA      3   
...            ...                ...        ...         ...   ...    ...   
1880460       2015  Missing/Undefined  40.481637 -122.389375    CA   None   
1880461       2015      Miscellaneous  37.617619 -120.938570    CA   None   
1880462       2015  Missing/Undefined  37.617619 -120.938570    CA   None   
1880463       2015  Missing/Undefined  37.672235 -120.898356    CA   None   
1880464       2015      Miscellaneous  34.263217 -116.830950    CA   None   

         FIPS_NAME  DISCOVERY_DATE  CONT_DATE  FIRE_SIZE  DISCOVERY_DOY  \
0        

In [529]:
# Prune years not in range 2012-2015 for fire data
df = df[df['FIRE_YEAR'] >= 2012]
print(df)

FIRE_YEAR   STAT_CAUSE_DESCR   LATITUDE   LONGITUDE STATE COUNTY  \
1563819       2012           Campfire  45.991944 -113.471389    MT    039   
1563820       2012           Campfire  45.946667 -112.366111    MT    043   
1563821       2012           Campfire  45.418611 -111.855833    MT    057   
1563822       2012           Campfire  46.219167 -112.243333    MT    093   
1563823       2012          Lightning  44.942222 -113.458611    MT    001   
...            ...                ...        ...         ...   ...    ...   
1880460       2015  Missing/Undefined  40.481637 -122.389375    CA   None   
1880461       2015      Miscellaneous  37.617619 -120.938570    CA   None   
1880462       2015  Missing/Undefined  37.617619 -120.938570    CA   None   
1880463       2015  Missing/Undefined  37.672235 -120.898356    CA   None   
1880464       2015      Miscellaneous  34.263217 -116.830950    CA   None   

          FIPS_NAME  DISCOVERY_DATE  CONT_DATE  FIRE_SIZE  DISCOVERY_DOY  \
1563819 

In [530]:
# Read in Weather Data
# df_fire = pd.read_csv('California_Fire_Incidents.csv')
df_humidity = pd.read_csv('humidity.csv')
df_pressure = pd.read_csv('pressure.csv')
df_temp = pd.read_csv('temperature.csv')
df_weather = pd.read_csv('weather_description.csv')
df_wind_direction = pd.read_csv('wind_direction.csv')
df_wind_speed = pd.read_csv('wind_speed.csv')
df_city_attributes = pd.read_csv('city_attributes.csv')

In [531]:
# Prune Irrelevant Cities
cities = df_humidity.columns.drop(['Jerusalem', 'Haifa', 'Eilat', 'Tel Aviv District', 'Beersheba', 'Nahariyya'])
df_humidity = df_humidity[cities]
df_pressure = df_pressure[cities]
df_temp = df_temp[cities]
df_weather = df_weather[cities]
df_wind_direction = df_wind_direction[cities]
df_wind_speed = df_wind_speed[cities]
# df_city_attributes = df_city_attributes[cities]

In [532]:
# Find all cities common between forest fire data and weather data
common_cities = []
for city in cities:
    if(len(df.loc[df['FIPS_NAME']==city]) != 0):
        common_cities.append(city)

In [533]:
# Prune all cities not common between forest fire data and weather data
df = df[df['FIPS_NAME'].isin(common_cities)]
print(df)

FIRE_YEAR   STAT_CAUSE_DESCR   LATITUDE   LONGITUDE STATE  \
1567432       2012      Miscellaneous  34.204167 -117.808333    CA   
1567433       2012          Lightning  34.345278 -117.928889    CA   
1567434       2012      Miscellaneous  34.548056 -118.671667    CA   
1567436       2012      Equipment Use  34.489444 -118.285833    CA   
1567437       2012           Children  34.467778 -118.530833    CA   
...            ...                ...        ...         ...   ...   
1872185       2015  Missing/Undefined  33.387140 -117.174866    CA   
1872229       2015     Debris Burning  33.226976 -117.024311    CA   
1872237       2015  Missing/Undefined  34.666666 -118.766666    CA   
1872244       2015  Missing/Undefined  33.388859 -117.255707    CA   
1872247       2015     Debris Burning  33.243255 -117.241177    CA   

              COUNTY    FIPS_NAME  DISCOVERY_DATE  CONT_DATE  FIRE_SIZE  \
1567432          037  Los Angeles       2456104.5  2456104.5       0.10   
1567433          0

In [534]:
# Combine and Restructure all weather dataframes into one DataFrame and append a city column
columns = ['time', 'humidity', 'pressure', 'temperature', 'weather', 'wind direction', 'wind speed']
df_weatherdata = pd.DataFrame()
for city in common_cities:
    newdf = pd.DataFrame(pd.concat([df_humidity['datetime'], df_humidity[city], df_pressure[city], df_temp[city], df_weather[city], df_wind_direction[city], df_wind_speed[city]], axis=1, keys=columns))
    newdf['city'] = city
    df_weatherdata = df_weatherdata.append(newdf, ignore_index=True)
print(df_weatherdata)

time  humidity  pressure  temperature       weather  \
0       2012-10-01 13:00:00      88.0    1009.0   289.480000    light rain   
1       2012-10-01 14:00:00      87.0    1009.0   289.474993  sky is clear   
2       2012-10-01 15:00:00      86.0    1009.0   289.460618  sky is clear   
3       2012-10-01 16:00:00      85.0    1009.0   289.446243  sky is clear   
4       2012-10-01 17:00:00      84.0    1009.0   289.431869  sky is clear   
...                     ...       ...       ...          ...           ...   
362011  2017-11-29 20:00:00       NaN       NaN          NaN           NaN   
362012  2017-11-29 21:00:00       NaN       NaN          NaN           NaN   
362013  2017-11-29 22:00:00       NaN       NaN          NaN           NaN   
362014  2017-11-29 23:00:00       NaN       NaN          NaN           NaN   
362015  2017-11-30 00:00:00       NaN       NaN          NaN           NaN   

        wind direction  wind speed           city  
0                150.0         2.0

In [535]:
# Combine Date and Time for Easy Lookup
df_weatherdata['DATE'] = df_weatherdata['time'].str[0:11]
df_weatherdata['TIME'] = df_weatherdata['time'].str.split(":").str[0].str[11:]
df_weatherdata['DATETIME'] = df_weatherdata['DATE'] + df_weatherdata['TIME']
print(df_weatherdata['DATETIME'])

0         2012-10-01 13
1         2012-10-01 14
2         2012-10-01 15
3         2012-10-01 16
4         2012-10-01 17
              ...      
362011    2017-11-29 20
362012    2017-11-29 21
362013    2017-11-29 22
362014    2017-11-29 23
362015    2017-11-30 00
Name: DATETIME, Length: 362016, dtype: object


In [536]:
# Prune years that arent between 2012-2015
df_weatherdata = df_weatherdata[~(df_weatherdata['DATE'].astype(str).str.contains('2016') | df_weatherdata['DATE'].astype(str).str.contains('2017'))]
print(df_weatherdata['DATE'])

0         2012-10-01 
1         2012-10-01 
2         2012-10-01 
3         2012-10-01 
4         2012-10-01 
             ...     
345234    2015-12-31 
345235    2015-12-31 
345236    2015-12-31 
345237    2015-12-31 
345238    2015-12-31 
Name: DATE, Length: 227800, dtype: object


In [537]:
# Combine DATETIME and Cities for easy lookup for weather data
df_weatherdata['DATETIME_CITY'] = df_weatherdata['DATETIME'] + ' ' + df_weatherdata['city']
print(df_weatherdata['DATETIME_CITY'])

0         2012-10-01 13 San Francisco
1         2012-10-01 14 San Francisco
2         2012-10-01 15 San Francisco
3         2012-10-01 16 San Francisco
4         2012-10-01 17 San Francisco
                     ...             
345234         2015-12-31 19 New York
345235         2015-12-31 20 New York
345236         2015-12-31 21 New York
345237         2015-12-31 22 New York
345238         2015-12-31 23 New York
Name: DATETIME_CITY, Length: 227800, dtype: object


In [538]:
# Combine DATETIME and Cities for easy lookup for forest fire data
df['DATETIME_CITY'] = df['DATETIME'].astype(str) + ' ' + df['FIPS_NAME']
print(df['DATETIME_CITY'])

1567432    2012-06-26 10 Los Angeles
1567433    2012-09-10 10 Los Angeles
1567434    2012-11-05 13 Los Angeles
1567436    2012-06-28 10 Los Angeles
1567437    2012-06-21 18 Los Angeles
                     ...            
1872185      2015-11-28 15 San Diego
1872229      2015-12-29 17 San Diego
1872237    2015-11-04 14 Los Angeles
1872244      2015-10-13 03 San Diego
1872247      2015-12-09 16 San Diego
Name: DATETIME_CITY, Length: 3722, dtype: object


In [539]:
df_weatherdata['FIRE_DAYS'] = False

In [540]:
# Create a Fire column to track which datetimes in which cities had fires occur
df_weatherdata['FIRE'] = False
df_weatherdata['FIRE'][df_weatherdata.loc[df_weatherdata['DATETIME_CITY'].isin(df['DATETIME_CITY'].values)].index] = True
df_weatherdata = df_weatherdata[df_weatherdata.columns.drop(['time', 'DATE', 'TIME', 'DATETIME_CITY'])]
print(df_weatherdata.loc[df_weatherdata['FIRE']==True])

humidity  pressure  temperature               weather  wind direction  \
14176       69.0    1017.0   294.270000          sky is clear           271.0   
14221        NaN    1014.0   292.180000                  haze           334.0   
20087      100.0    1038.0   282.415000         broken clouds           219.0   
21515       77.0    1030.0   289.181500       overcast clouds           288.0   
24145       87.0    1012.0   289.090000                   fog           290.0   
...          ...       ...          ...                   ...             ...   
344254      45.0    1025.0   282.632909  heavy intensity rain           312.0   
344487      60.0    1030.0   276.642214         broken clouds            25.0   
344558      66.0    1011.0   281.390000      scattered clouds           270.0   
344607      65.0    1039.0   280.294636          sky is clear            15.0   
344995      53.0    1025.0   283.583269       overcast clouds           223.0   

        wind speed           city  

In [551]:
df_weatherdata['HOUR'] = df_weatherdata['DATETIME'].str[11:]
df_weatherdata['DAY'] = df_weatherdata['DATETIME'].str[8:10]
df_weatherdata['MONTH'] = df_weatherdata['DATETIME'].str[5:7]
df_weatherdata['YEAR'] = df_weatherdata['DATETIME'].str[0:4]

# PART 2

In [619]:
X = df_weatherdata[df_weatherdata.columns.drop(['FIRE', 'FIRE_DAYS', 'DATETIME'])]
y = df_weatherdata['FIRE']

In [620]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [621]:
print(X_train)

humidity  pressure  temperature          weather  wind direction  \
149434      77.0    1008.0   292.910000  overcast clouds           170.0   
140854      45.0    1007.0   300.830000       few clouds           140.0   
288700      81.0    1027.0   303.429333     sky is clear           218.0   
316913      50.0    1007.0   297.400000    broken clouds           250.0   
91933       72.0    1023.0   288.200000    broken clouds             0.0   
...          ...       ...          ...              ...             ...   
186987      65.0    1013.0   304.310000     sky is clear            84.0   
154025      55.0    1043.0   284.482000     sky is clear           110.0   
199040      46.0    1031.0   299.098500     sky is clear            77.0   
230752      66.0    1016.0   286.700000  overcast clouds           340.0   
189066      79.0    1010.0   299.130000     sky is clear           203.0   

        wind speed       city HOUR DAY MONTH  YEAR  
149434         6.0     Dallas   11  24    

In [622]:
# creating bool series True for NaN values  
bool_series = pd.isnull(X_train["wind speed"])  
    
# filtering data  
# displaying data only with Gender = NaN  
X_train[bool_series]  

Unnamed: 0,humidity,pressure,temperature,weather,wind direction,wind speed,city,HOUR,DAY,MONTH,YEAR
298441,78.0,1016.0,298.93,mist,,,Miami,14,28,10,2015
298442,78.0,1015.0,298.93505,broken clouds,,,Miami,15,28,10,2015


In [623]:
X_train['humidity'] = X_train['humidity'].fillna(method ='pad')
X_train['pressure'] = X_train['pressure'].fillna(method ='pad')
X_train['temperature'] = X_train['temperature'].fillna(method ='pad')
X_train['wind direction'] = X_train['wind direction'].fillna(method ='pad')
X_train['wind speed'] = X_train['wind speed'].fillna(method ='pad')

In [624]:
X_test['humidity'] = X_test['humidity'].fillna(method ='pad')
X_test['pressure'] = X_test['pressure'].fillna(method ='pad')
X_test['temperature'] = X_test['temperature'].fillna(method ='pad')
X_test['wind direction'] = X_test['wind direction'].fillna(method ='pad')
X_test['wind speed'] = X_test['wind speed'].fillna(method ='pad')

In [625]:
from sklearn import preprocessing

# Encode Labels
encoder = preprocessing.LabelEncoder()
X_train['weather'] = encoder.fit_transform(X_train['weather'])
X_train['city'] = encoder.fit_transform(X_train['city'])

X_test['weather'] = encoder.fit_transform(X_test['weather'])
X_test['city'] = encoder.fit_transform(X_test['city'])

# Encode DATETIME in a way that preserves its cyclical nature
X_train['HOUR_SIN'] = np.sin(X_train['HOUR'].astype(float)*(2.*np.pi/24))
X_train['HOUR_COS'] = np.cos(X_train['HOUR'].astype(float)*(2.*np.pi/24))
X_train['DAY_SIN'] = np.sin(X_train['DAY'].astype(float)*(2.*np.pi/30))
X_train['DAY_COS'] = np.cos(X_train['DAY'].astype(float)*(2.*np.pi/30))
X_train['MONTH_SIN'] = np.sin(X_train['MONTH'].astype(float)*(2.*np.pi/12))
X_train['MONTH_COS'] = np.cos(X_train['MONTH'].astype(float)*(2.*np.pi/12))

X_test['HOUR_SIN'] = np.sin(X_test['HOUR'].astype(float)*(2.*np.pi/24))
X_test['HOUR_COS'] = np.cos(X_test['HOUR'].astype(float)*(2.*np.pi/24))
X_test['DAY_SIN'] = np.sin(X_test['DAY'].astype(float)*(2.*np.pi/30))
X_test['DAY_COS'] = np.cos(X_test['DAY'].astype(float)*(2.*np.pi/30))
X_test['MONTH_SIN'] = np.sin(X_test['MONTH'].astype(float)*(2.*np.pi/12))
X_test['MONTH_COS'] = np.cos(X_test['MONTH'].astype(float)*(2.*np.pi/12))

X_train['YEAR'] = X_train['YEAR'].astype(float)
X_test['YEAR'] = X_test['YEAR'].astype(float)

X_train = X_train[X_train.columns.drop(['HOUR', 'DAY', 'MONTH'])]
X_test = X_test[X_test.columns.drop(['HOUR', 'DAY', 'MONTH'])]

y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [626]:
print(X_train)

humidity  pressure  temperature  weather  wind direction  wind speed  \
149434      77.0    1008.0   292.910000       16           170.0         6.0   
140854      45.0    1007.0   300.830000        3           140.0         7.0   
288700      81.0    1027.0   303.429333       23           218.0         3.0   
316913      50.0    1007.0   297.400000        0           250.0         5.0   
91933       72.0    1023.0   288.200000        0             0.0         0.0   
...          ...       ...          ...      ...             ...         ...   
186987      65.0    1013.0   304.310000       23            84.0         0.0   
154025      55.0    1043.0   284.482000       23           110.0         4.0   
199040      46.0    1031.0   299.098500       23            77.0         3.0   
230752      66.0    1016.0   286.700000       16           340.0         4.0   
189066      79.0    1010.0   299.130000       23           203.0         2.0   

        city    YEAR  HOUR_SIN      HOUR_COS   

In [627]:
# Standardize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [628]:
print(X_train)

[[ 0.33472517 -1.36303928  0.28250634 ...  0.41439979  1.30456295
  -0.78228429]
 [-1.29631557 -1.47527672  1.20351227 ...  1.35560703  0.77804012
  -1.29405949]
 [ 0.53860526  0.76947203  1.50578517 ... -1.30527141 -1.37969058
  -0.08318637]
 ...
 [-1.24534554  1.21842178  1.00215851 ... -0.16729827 -1.18696984
   0.61591156]
 [-0.22594508 -0.46513978 -0.43964604 ...  0.41439979  1.30456295
  -0.78228429]
 [ 0.43666522 -1.13856441  1.0058216  ...  1.26473687 -1.37969058
  -0.08318637]]


In [629]:
from imblearn.over_sampling import SMOTE, ADASYN

# Smote for resampling
X_resampled, y_resampled = SMOTE().fit_sample(X_train, y_train)

In [630]:
pd.crosstab(index=y_resampled, columns='count')

col_0,count
row_0,Unnamed: 1_level_1
0,151397
1,151397


In [631]:
def fiveCVLogisticRegression(xtrain, ytrain, C_grid):
    scores_train = np.zeros(len(C_grid))
    scores_test = np.zeros(len(C_grid))
    for i in range(5):
        print('Run ', i+1)
        kfold = StratifiedKFold(n_splits=5, shuffle=True)
        
        model = LogisticRegression(penalty='l2')

        grid={"C":C_grid, "penalty":["l2"]} # l2 ridge
        logreg=LogisticRegression()
        logreg_cv=GridSearchCV(logreg,grid,cv=kfold,return_train_score=True)
        logreg_cv.fit(xtrain, ytrain)

        # View the accuracy score
        # print('Best score for training data:', logreg_cv.best_score_,"\n") 

        # View the best parameters for the model found using grid search
        print('Best lambda:',1.0/logreg_cv.best_estimator_.C,"\n") 


        # print("Training set score for logreg_cv: ",  logreg_cv.cv_results_['mean_train_score'])
        # print("Testing  set score for logreg_cv: ", logreg_cv.cv_results_['mean_test_score'])

        scores_train += logreg_cv.cv_results_['mean_train_score']
        scores_test += logreg_cv.cv_results_['mean_test_score']

    scores_train = scores_train/5
    scores_test = scores_test/5
   

    print('avg train score: ', scores_train)
    print('avg test score: ', scores_test)
    return scores_train, scores_test

In [632]:
C_grid = [1e-3, 1e-2, 1e-1, 1, 1e1, 50.0, 1e2, 1e3]
scores_train_std, scores_test_std = fiveCVLogisticRegression(X_resampled, y_train, C_grid)

Run  1
Best lambda: 1000.0 

Run  2
Best lambda: 1000.0 

Run  3
Best lambda: 1000.0 

Run  4
Best lambda: 1000.0 

Run  5
Best lambda: 1000.0 

avg train score:  [0.99194764 0.99194764 0.99194764 0.99194764 0.99194764 0.99194764
 0.99194764 0.99194764]
avg test score:  [0.99194764 0.99194764 0.99194764 0.99194764 0.99194764 0.99194764
 0.99194764 0.99194764]


In [633]:
def CVLogisticRegression(xtrain, ytrain, xtest, ytest, _lambda):
    kfold = StratifiedKFold(n_splits=5, shuffle=True)
    model = LogisticRegression(penalty='l2', C=1/_lambda)
    model.fit(xtrain, ytrain)

    train_score = 1 - model.score(xtrain, ytrain)
    test_score = 1 - model.score(xtest, ytest)


    return train_score, test_score

In [637]:
avg_train_error, avg_test_error = CVLogisticRegression(X_train, y_train, X_test, y_test, 1000.0)

In [638]:
table = {'Method': ['std'], 
        #  'lambda': [best_lambda_std, best_lambda_log, best_lambda_bin], 
        #  'Avg Val Error': [avg_val_error_std, avg_val_error_log, avg_val_error_bin], 
         'Train Error': [avg_train_error], 
        'Test Error': [avg_test_error]}
df_table = pd.DataFrame(table)

In [639]:
df_table

Unnamed: 0,Method,Train Error,Test Error
0,std,0.008052,0.007676


In [641]:
len(y_train[])

152626