In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt      
import math

import sklearn
from sklearn import linear_model, metrics, preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
%matplotlib inline

## In the code below we create a new driver (of Selenium) and with its help we obtain information about all the names of the states in the United States.

In [4]:
options = Options()
options.add_argument("--window-size=1920,1200")

DRIVER_PATH = './chromedriver.exe'
driver = webdriver.Chrome(options = options, executable_path=DRIVER_PATH)
States = []
homePageURL = "https://data.elpasotimes.com/tornado-archive/"
driver.get(homePageURL)
statesElements = driver.find_elements_by_xpath("//*[@id='tornadoState']/option")
for stateElement in statesElements:
    States.append(stateElement.get_attribute("value"))

  driver = webdriver.Chrome(options = options, executable_path=DRIVER_PATH)
  statesElements = driver.find_elements_by_xpath("//*[@id='tornadoState']/option")


## In the code below we use selenium to get all the hurricanes from each state in the years when there were hurricanes

In [51]:
Day = []
State = []
Year = []
FScale = []
Length = []
Width = []
Injury = []
Fatality = []
PropartyDamage = []
date = ""
for state in States:
    years = []
    baseURL = f'https://data.elpasotimes.com/tornado-archive/{state}/'
    driver.get(baseURL)
    yearss = driver.find_elements_by_xpath('//*[@id="tornadoSummary"]/tbody/tr')
    for y in yearss:
        years.append(y.find_element_by_xpath("./td[1]/a").text)
    years.pop(0)
    for year in years:
            baseURL=f'https://data.elpasotimes.com/tornado-archive/{state}/{year}/'
            driver.get(baseURL)
            table = driver.find_elements_by_xpath('//*[@id="YearTornado"]/tbody/tr')
            for item in table:
                date = item.find_element_by_xpath("./td[1]/a").text
                indices = [0,7,9,14]
                parts = [date[i:j] for i,j in zip(indices, indices[1:]+[None])]
                Day.append(parts[0])
                Year.append(parts[2])
                FScale.append(item.find_element_by_xpath("./td[2]").text)
                Length.append(item.find_element_by_xpath("./td[3]").text)
                Width.append(item.find_element_by_xpath("./td[4]").text)
                Injury.append(item.find_element_by_xpath("./td[5]").text)
                Fatality.append(item.find_element_by_xpath("./td[6]").text)
                PropartyDamage.append(item.find_element_by_xpath("./td[7]").text)
                State.append(state)
finalAllStatedDF = pd.DataFrame({"Month-Day":Day,"Year":Year,'F Scale': FScale,'Length(in miles)': Length,'Width(in feet)': Width,'Injury': Injury,'Fatality': Fatality,'Proparty damage': PropartyDamage,"State":State})
driver.close()
finalAllStatedDF.to_csv("finalAllStates.csv",index=False)
finalAllStatedDF

    



  yearss = driver.find_elements_by_xpath('//*[@id="tornadoSummary"]/tbody/tr')
  table = driver.find_elements_by_xpath('//*[@id="YearTornado"]/tbody/tr')


Unnamed: 0,Month-Day,Year,F Scale,Length(in miles),Width(in feet),Injury,Fatality,Proparty damage,State
0,Aug. 31,2021,EF0,1.71,25,0 (0),0 (0),"$10,000",alabama
1,Aug. 31,2021,EF0,0.58,25,0 (0),0 (0),,alabama
2,Aug. 31,2021,EF0,0.65,125,0 (0),0 (0),"$100,000",alabama
3,Aug. 31,2021,EF0,0.89,20,0 (0),0 (0),"$20,000",alabama
4,Aug. 30,2021,EF0,0.48,75,0 (0),0 (0),,alabama
...,...,...,...,...,...,...,...,...,...
71923,May. 28,1953,EF2,38.2,433,0 (0),0 (0),"$2,500",wyoming
71924,May. 08,1952,EF1,2,100,0 (0),0 (0),$30,wyoming
71925,Jun. 14,1950,EF1,.2,10,0 (0),0 (0),$30,wyoming
71926,Jun. 07,1950,EF1,.2,10,0 (0),0 (0),$30,wyoming


## In the code below we start clearing the Data frame (from rows whose value is missing, and also from rows that have duplicates)

In [None]:
beforeCleaningDf = pd.read_csv("finalAllStates.csv")
print("all the rows with there number of None values:\n",beforeCleaningDf.isna().sum())
beforeCleaningDf.dropna(axis = 0, inplace=True)
print("number of duplicated rows:",beforeCleaningDf.duplicated().sum())
beforeCleaningDf.drop_duplicates(inplace=True)

## In the code below we perform a deeper cleaning for each column individually

In [None]:
beforeCleaningDf['Proparty damage'] = beforeCleaningDf['Proparty damage'].str.replace('$','')
beforeCleaningDf['Proparty damage'] = beforeCleaningDf['Proparty damage'].str.replace(',','')
beforeCleaningDf['Proparty damage'] = beforeCleaningDf['Proparty damage'].astype(np.int64)
beforeCleaningDf.rename(columns={'Proparty damage':'Proparty damage (in $)'},inplace=True)

beforeCleaningDf['F Scale'] = beforeCleaningDf['F Scale'].str.replace('EF','')
print("The number of hurricanes in each strength category\n",beforeCleaningDf['F Scale'].value_counts())
beforeCleaningDf.drop(beforeCleaningDf.loc[beforeCleaningDf['F Scale']=='U'].index, inplace=True)
beforeCleaningDf['F Scale'] = beforeCleaningDf['F Scale'].astype(np.int64)
dic = {(0,1):0,(2,3):1,(4,5):2}
beforeCleaningDf.replace({"F Scale":dic},inplace=True)

beforeCleaningDf['Injury'] = beforeCleaningDf['Injury'].astype(str).str.replace(r"\(.*\)","")
beforeCleaningDf['Fatality'] = beforeCleaningDf['Fatality'].astype(str).str.replace(r"\(.*\)","")

beforeCleaningDf['Year'] = beforeCleaningDf['Year'].astype(np.int64)

beforeCleaningDf.to_csv("cleanAllStates.csv",index=False)

## In the code below we implement the EDA (exploratory data analysis)

In [None]:
clean_df = pd.read_csv("cleanAllStates.csv")
dic = {0:"Moderate",1:"Significant",2:"Devestating"}
clean_df.replace({"F Scale":dic},inplace=True)
ct1=pd.crosstab(clean_df['Year'],clean_df['F Scale'])
ct1.plot(kind='bar',figsize=(21,8))

In [None]:
clean_df = pd.read_csv("cleanAllStates.csv")
fig, axes = plt.subplots(figsize=(20,5))
FScaleSeries = clean_df['F Scale'].value_counts()
FScaleSeries = FScaleSeries.rename('')
FScaleSeries.index = ["Moderate Strength[64km/h-180km/h]","Significant Strength[181km/h-331km/h]","Devastating Strength[332km/h-511km/h]"]
FScaleSeries.plot(kind = 'pie', ax = axes,title = 'Strength of the tornado')

In [None]:
groupbyDf = clean_df[['F Scale','Proparty damage (in $)']].groupby('F Scale').sum()/1000000000
groupbyDf.index = ["Moderate Strength","Significant Strength","Devastating Strength"]
ax = groupbyDf.plot.barh(color = 'g')
ax.set(xlabel='Proparty damage in billion of dollars', ylabel='Strength of the tornado')
plt.show()

In [None]:
fscaleChange = clean_df.copy()
order_state_by_latitude = ['hawaii','florida','louisiana','texas','mississippi','alabama'
                          ,'georgia','south-carolina','new-mexico','arizona','arkansas','north-carolina'
                          ,'oklahoma','tennessee','california','kentucky','virginia','missouri','kansas'
                          ,'nevada','maryland','district-of-columbia','washington','west-virginia','colorado'
                          ,'delaware','utah','illinois','indiana','new-jersey','ohio','new-york',
                          'pennsylvania','nebraska','connecticut','rhode-island','iowa','massachusetts'
                          ,'wyoming','vermont','new-hampshire','oregon','south-dakota','wisconsin','michigan'
                          ,'maine','idaho','minnesota','montana','north-dakota']
dic = {0:"Moderate Damage",1:"Significant Damage",2:"Devastating Damage"}
fscaleChange.replace({"F Scale":dic},inplace=True)
ct = pd.crosstab(fscaleChange['State'],fscaleChange['F Scale'])
font = {'weight' : 'bold',
        'size'   : 15}
plt.rc('font', **font)
ax = ct.loc[order_state_by_latitude].plot(kind = 'bar',figsize = (20,7))
ax.set(ylabel='number of the tornadoes')
plt.show()

In [None]:
groupbyDf = clean_df[['State','Injury','Fatality']].groupby('State').sum()
groupbyDf['Injury+Fatality'] = groupbyDf.iloc[:,-2:].sum(axis = 1)
groupbyDf = groupbyDf.reset_index()
bins = [-1,1000,3000,5000,8616]
labels = ["0-1000",'1001-3000','3001-5000','5001-8616']
groupbyDf['Injury+Fatality-Number'] = pd.cut(groupbyDf['Injury+Fatality'], bins = bins,labels=labels)
ct = pd.crosstab(groupbyDf['State'],groupbyDf['Injury+Fatality-Number'])
ax = ct.loc[order_state_by_latitude].plot(kind = 'bar',figsize = (20,10))
plt.show()

## Machine learning (part one) - can the intensity of the tornado be predicted?

In [None]:
def load_dataset(file_name, target_column):
    df = pd.read_csv(file_name)
    X = df.drop(target_column, axis = 1)
    y = pd.Series(df[target_column])

    return X, y


def split_to_train_and_test(X, y, test_ratio, rand_state):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=rand_state)
    return X_train, X_test, y_train, y_test


def scale_features(X_train, scale_type):
    if scale_type == 'standard':
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
    elif scale_type == 'minmax':
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
    
    return scaler, X_train_scaled



def scale_test_features(X_test, scaler):
    X_test_scaled = scaler.transform(X_test)
    return X_test_scaled
    

    
def train_model(X_train, y_train):
    clf_model = LogisticRegression()
    trained_model = clf_model.fit(X_train, y_train)
    return trained_model


def predict_model(trained_model, X_test):
    predicted_vals = trained_model.predict(X_test)
    return predicted_vals


def evaluate_performance(y_test,y_predicted):
    evaluate_value = metrics.f1_score(y_test, y_predicted, average='micro')
    return evaluate_value

X, y = load_dataset("c:/users/oren keinan/cleanAllStates.csv", "F Scale")


X=X.drop({"Month-Day","Year","State"},axis=1)


X_train, X_test, y_train, y_test = split_to_train_and_test(X,y,0.3,41)

standard_scaler, X_train_standard_scaled = scale_features(X_train,"standard")

X_test_scaled = scale_test_features(X_test, standard_scaler)

trained_standard_model = train_model(X_train_standard_scaled, y_train)

predict_vals = predict_model(trained_standard_model ,X_test_scaled)

y_pred = pd.Series(predict_vals,index=X_test.index)

eva = evaluate_performance(y_test,y_pred)

print(metrics.confusion_matrix(y_test, y_pred))

print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred, average='micro'))
print("recall is:",metrics.recall_score(y_test, y_pred, average='micro'))
print("f1 is:",metrics.f1_score(y_test, y_pred, average='micro'))

## Machine learning (part two) - Is it possible by entering the distance and width values to predict the intensity of the tornado? 

In [None]:
df = pd.read_csv('cleanAllStates.csv')
df_FScale_0 = df[df['F Scale']==0].reset_index(drop=True)
df_FScale_1 = df[df['F Scale']==1].reset_index(drop=True)
df_FScale_2 = df[df['F Scale']==2].reset_index(drop=True)
df_FScale_0['TornadoLvl'] = 0
df_FScale_1['TornadoLvl'] = 1
df_FScale_2['TornadoLvl'] = 2
df = pd.concat([df_FScale_0, df_FScale_1,df_FScale_2], ignore_index=True).drop(['F Scale'], axis=1)

X = df[df.columns[(df.columns != 'Month-Day') & (df.columns != 'Year') & (df.columns != 'TornadoLvl') &
                 (df.columns != 'Proparty damage (in $)') & (df.columns != 'State') & 
                 (df.columns != 'Injury') & (df.columns != 'Fatality')]]

y = df['TornadoLvl']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

scaler = MinMaxScaler(feature_range=(-2,2))

X_train_scaled = scaler.fit_transform(X_train)

model = LogisticRegression().fit(X_train_scaled, y_train)

scaler.fit(X_test)
Length = float(input("Please enter Length(in miles) of the tornado (between 0-500): "))
Width = float(input('Please Width(in feet) of the tornado (between 0-4576): '))
LenWid = scaler.transform([[Length,Width]])
predicted_vals = model.predict_proba(LenWid)[0]
while(0<=Length<=500 and 0<=Width<=4575):
    lowIntensity, medIntensity, highIntensity = predicted_vals[0]*100, predicted_vals[1]*100, predicted_vals[2]*100
    print('Length , Width: ({},{}): Low tornado intensity probability: {:.2f}%\n\t\t\tMedium tornado intensity probability: {:.2f}%\n\t\t\tHigh tornado intensity probability: {:.2f}%\n'.format(Length,Width, lowIntensity,medIntensity, highIntensity))
    Length = float(input("Please enter Length(in miles) of the tornado (between 0-500): "))
    Width = float(input('Please enter  Width(in feet) of the tornado (between 0-4576):'))
    print('\n')
    LenWid = scaler.transform([[Length,Width]])
    predicted_vals = model.predict_proba(LenWid)[0]
print('Input is out of range!')