In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import time
import datetime

# Load Data
df = pd.read_csv('seattle_weather_1948-2017.csv')

In [2]:
# Baseline model
df["RAIN"].value_counts(normalize=True)[0]

0.5733521214967904

In [3]:
#null
df.isna().sum()

DATE    0
PRCP    3
TMAX    0
TMIN    0
RAIN    3
dtype: int64

In [4]:
# see the varaibles 
df["RAIN"].value_counts()


False    14648
True     10900
Name: RAIN, dtype: int64

In [5]:
#fill na 
df['RAIN'] = df['RAIN'].fillna(False)
df["RAIN"].value_counts()

False    14651
True     10900
Name: RAIN, dtype: int64

In [6]:
df["PRCP"].mean()

0.10622162204477956

In [7]:
df['PRCP'].fillna(df['PRCP'].mean(), inplace=True)
df["PRCP"].mean()

0.10622162204477957

# Modified One

In [8]:
#This code done by (Mikio Harman) , and have been mofified by Ahmed. 
def heuristic(df):
    """
    Simple heuristic:
    
    If it rained the past one day then predict rain else predict no rain
    
    The Frist row are predicted False be default
    """
    
    preds = []
    
    for x in range(len(df)):
        # If first two rows then predict false
        if x<1:
            preds.append(False)
        else:
            # If either of last day == True then predict True
            if (df.iloc[x-1]["RAIN"] == True):
                preds.append(True)
            else:
                preds.append(False)
    return preds
    

In [9]:
#This code done by (Mikio Harman)
df["preds"] = heuristic(df)
df["preds"] 

0        False
1         True
2         True
3         True
4         True
         ...  
25546    False
25547    False
25548    False
25549    False
25550    False
Name: preds, Length: 25551, dtype: bool

In [10]:
#This code done by (Mikio Harman)
# Determine Accuracy

# Create function to to find values

def calc_confuse(df):
    
    "Calculate all possible results of a confusion matrix"

    # Hold all possible values and set to zero
    FP = np.zeros(len(df))
    TP = np.zeros(len(df))
    FN = np.zeros(len(df))
    TN = np.zeros(len(df))
    
    for x in range(len(df)):
        
        # True Positive
        if (df["RAIN"].iloc[x] == True) & (df["preds"].iloc[x] == True):
            TP[x] = 1
        # True Negative
        elif (df["RAIN"].iloc[x] == False) & (df["preds"].iloc[x] == False):
            TN[x] = 1
        # False Negative
        elif (df["RAIN"].iloc[x] == True) & (df["preds"].iloc[x] == False):
            FN[x] = 1
        # False Positive
        else:
            FP[x] = 1
    
    return FP, TP, FN, TN

In [11]:
#This code done by (Mikio Harman)
# Extract results and create columns for each
w,x,y,z = calc_confuse(df)

df["FP"] = w
df["TP"] = x
df["FN"] = y
df["TN"] = z

# Look at 10 random rows to determin accuracy
df.sample(10)

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,FP,TP,FN,TN
18184,1997-10-14,0.15,57,49,True,False,0.0,0.0,1.0,0.0
11652,1979-11-26,0.14,40,32,True,True,0.0,1.0,0.0,0.0
2664,1955-04-18,0.0,53,36,False,True,1.0,0.0,0.0,0.0
14618,1988-01-09,0.35,47,36,True,True,0.0,1.0,0.0,0.0
19065,2000-03-13,0.46,58,42,True,False,0.0,0.0,1.0,0.0
312,1948-11-08,0.0,46,27,False,False,0.0,0.0,0.0,1.0
24716,2015-09-02,0.0,67,52,False,True,1.0,0.0,0.0,0.0
4832,1961-03-25,0.01,55,43,True,True,0.0,1.0,0.0,0.0
831,1950-04-11,0.3,58,43,True,False,0.0,0.0,1.0,0.0
21471,2006-10-14,0.04,51,47,True,False,0.0,0.0,1.0,0.0


In [12]:
(sum(df["TP"]) + sum(df["TN"]))/ len(df)

0.7112441783100466

# First One 

In [13]:
##### making a copy
df_copy = df.copy()

In [14]:
# extract year,month,day
# extract the year and put it in new column 
df_copy['year'] = pd.DatetimeIndex(df_copy['DATE']).year
df_copy['month'] = pd.DatetimeIndex(df_copy['DATE']).month
df_copy['day'] = pd.DatetimeIndex(df_copy['DATE']).day

#deleting the five columns 
del df_copy['FP']
del df_copy['TP']
del df_copy['FN']
del df_copy['TN']
df_copy

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,year,month,day
0,1948-01-01,0.47,51,42,True,False,1948,1,1
1,1948-01-02,0.59,45,36,True,True,1948,1,2
2,1948-01-03,0.42,45,35,True,True,1948,1,3
3,1948-01-04,0.31,45,34,True,True,1948,1,4
4,1948-01-05,0.17,45,32,True,True,1948,1,5
...,...,...,...,...,...,...,...,...,...
25546,2017-12-10,0.00,49,34,False,False,2017,12,10
25547,2017-12-11,0.00,49,29,False,False,2017,12,11
25548,2017-12-12,0.00,46,32,False,False,2017,12,12
25549,2017-12-13,0.00,48,34,False,False,2017,12,13


In [15]:
# EDA , check each month with the value of raining , so :
# on the first month True is the most value 
# on the fourth month False is the most value , etc 
i = 1
while i <= 12:
    print(i)
    new = df_copy[(df_copy["month"] == i)]
    print(new["RAIN"].value_counts())
    i=i+1

1
True     1298
False     872
Name: RAIN, dtype: int64
2
True     1103
False     875
Name: RAIN, dtype: int64
3
True     1212
False     958
Name: RAIN, dtype: int64
4
False    1102
True      998
Name: RAIN, dtype: int64
5
False    1399
True      771
Name: RAIN, dtype: int64
6
False    1468
True      632
Name: RAIN, dtype: int64
7
False    1827
True      343
Name: RAIN, dtype: int64
8
False    1757
True      413
Name: RAIN, dtype: int64
9
False    1491
True      609
Name: RAIN, dtype: int64
10
False    1220
True      950
Name: RAIN, dtype: int64
11
True     1264
False     836
Name: RAIN, dtype: int64
12
True     1307
False     846
Name: RAIN, dtype: int64


In [16]:
##### The second attempt
def heuristic_two(df_date):
    """
    Simple heuristic:
    
    it takes per each month and insert the prediction values 
    """
#     df_date.iloc[12]["RAIN"]
    preds = []
    
    for x in range(len(df_date)):
              
        if (df_date.iloc[x]["month"] == 1):
            preds.append(True)
        elif df_date.iloc[x]["month"] == 2:
            preds.append(True)
        elif df_date.iloc[x]["month"] == 3:
            preds.append(True)
        elif df_date.iloc[x]["month"] == 4:
            preds.append(False )
        elif df_date.iloc[x]["month"] == 5:
            preds.append(False )
        elif df_date.iloc[x]["month"] == 6:
            preds.append(False )
        elif df_date.iloc[x]["month"] == 7:
            preds.append(False )
        elif df_date.iloc[x]["month"] == 8:
            preds.append(False )
        elif df_date.iloc[x]["month"] == 9:
            preds.append(False )
        elif df_date.iloc[x]["month"] == 10:
            preds.append(False )
        elif df_date.iloc[x]["month"] == 11:
            preds.append(True)
        elif df_date.iloc[x]["month"] == 12:
            preds.append(True)
            
    return preds

In [17]:
df_copy["preds"] = heuristic_two(df_copy)
df_copy["preds"] 

0        True
1        True
2        True
3        True
4        True
         ... 
25546    True
25547    True
25548    True
25549    True
25550    True
Name: preds, Length: 25551, dtype: bool

In [18]:
#This code done by (Mikio Harman)
# Extract results and create columns for each
w,x,y,z = calc_confuse(df_copy)

df_copy["FP"] = w
df_copy["TP"] = x
df_copy["FN"] = y
df_copy["TN"] = z

# Look at 10 random rows to determin accuracy
df_copy.sample(10)

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,year,month,day,FP,TP,FN,TN
22705,2010-03-01,0.0,57,42,False,True,2010,3,1,1.0,0.0,0.0,0.0
9715,1974-08-07,0.0,68,49,False,False,1974,8,7,0.0,0.0,0.0,1.0
20769,2004-11-11,0.0,50,38,False,True,2004,11,11,1.0,0.0,0.0,0.0
4750,1961-01-02,0.0,41,25,False,True,1961,1,2,1.0,0.0,0.0,0.0
9301,1973-06-19,0.15,71,55,True,False,1973,6,19,0.0,0.0,1.0,0.0
7173,1967-08-22,0.0,83,61,False,False,1967,8,22,0.0,0.0,0.0,1.0
5313,1962-07-19,0.0,73,48,False,False,1962,7,19,0.0,0.0,0.0,1.0
6954,1967-01-15,0.24,50,42,True,True,1967,1,15,0.0,1.0,0.0,0.0
5987,1964-05-23,0.04,65,40,True,False,1964,5,23,0.0,0.0,1.0,0.0
8010,1969-12-06,0.0,49,42,False,True,1969,12,6,1.0,0.0,0.0,0.0


In [19]:
(sum(df_copy["TP"]) + sum(df_copy["TN"]))/ len(df_copy)

0.6437321435560251

# Second One 

In [20]:
# Another algorithm 
#deleting the four columns 
del df_copy['FP']
del df_copy['TP']
del df_copy['FN']
del df_copy['TN']
df_copy


Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,year,month,day
0,1948-01-01,0.47,51,42,True,True,1948,1,1
1,1948-01-02,0.59,45,36,True,True,1948,1,2
2,1948-01-03,0.42,45,35,True,True,1948,1,3
3,1948-01-04,0.31,45,34,True,True,1948,1,4
4,1948-01-05,0.17,45,32,True,True,1948,1,5
...,...,...,...,...,...,...,...,...,...
25546,2017-12-10,0.00,49,34,False,True,2017,12,10
25547,2017-12-11,0.00,49,29,False,True,2017,12,11
25548,2017-12-12,0.00,46,32,False,True,2017,12,12
25549,2017-12-13,0.00,48,34,False,True,2017,12,13


In [21]:
#take the first year and take the values to compare it later 
df_first_year = df_copy[df_copy["year"] == df_copy["year"].min()]
df_first_year

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,year,month,day
0,1948-01-01,0.47,51,42,True,True,1948,1,1
1,1948-01-02,0.59,45,36,True,True,1948,1,2
2,1948-01-03,0.42,45,35,True,True,1948,1,3
3,1948-01-04,0.31,45,34,True,True,1948,1,4
4,1948-01-05,0.17,45,32,True,True,1948,1,5
...,...,...,...,...,...,...,...,...,...
361,1948-12-27,0.00,34,21,False,True,1948,12,27
362,1948-12-28,0.33,45,31,True,True,1948,12,28
363,1948-12-29,0.29,43,34,True,True,1948,12,29
364,1948-12-30,0.46,44,35,True,True,1948,12,30


In [22]:
# delete from the copy_df the same year and start to pridict others on the day and the month 
df_copy = df_copy[df_copy.year != df_copy.year.min()]
df_copy

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,year,month,day
366,1949-01-01,0.00,35,26,False,True,1949,1,1
367,1949-01-02,0.03,40,22,True,True,1949,1,2
368,1949-01-03,0.00,40,17,False,True,1949,1,3
369,1949-01-04,0.00,42,20,False,True,1949,1,4
370,1949-01-05,0.00,41,28,False,True,1949,1,5
...,...,...,...,...,...,...,...,...,...
25546,2017-12-10,0.00,49,34,False,True,2017,12,10
25547,2017-12-11,0.00,49,29,False,True,2017,12,11
25548,2017-12-12,0.00,46,32,False,True,2017,12,12
25549,2017-12-13,0.00,48,34,False,True,2017,12,13


In [23]:
new_df_copy = df_copy.reset_index(drop=True)
new_df_copy

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,year,month,day
0,1949-01-01,0.00,35,26,False,True,1949,1,1
1,1949-01-02,0.03,40,22,True,True,1949,1,2
2,1949-01-03,0.00,40,17,False,True,1949,1,3
3,1949-01-04,0.00,42,20,False,True,1949,1,4
4,1949-01-05,0.00,41,28,False,True,1949,1,5
...,...,...,...,...,...,...,...,...,...
25180,2017-12-10,0.00,49,34,False,True,2017,12,10
25181,2017-12-11,0.00,49,29,False,True,2017,12,11
25182,2017-12-12,0.00,46,32,False,True,2017,12,12
25183,2017-12-13,0.00,48,34,False,True,2017,12,13


In [24]:
##### here for the third attempt
def heuristic_three(df_date):
    """
    Simple heuristic:
    
    predicit the raining day by taking the month and the day in 1948 which is the minimum year value of the dataset
    and assign the raning boolean to the new list preds
    """
#     
    preds = []
       
    for x in range(len(df_date)):
        #take the month and day and save them 
        month = df_date.iloc[x]["month"]
        day = df_date.iloc[x]["day"]
        # take the rain value for month and day in 1948 and save it as prediction
        preds_bool = df_first_year.loc[ (df_first_year['month'] == month) & (df_first_year['day'] == day)]["RAIN"].bool()
        preds.append(preds_bool)
        
        
            
    return preds

In [25]:
new_df_copy["preds"] = heuristic_three(new_df_copy)
new_df_copy["preds"]

0        True
1        True
2        True
3        True
4        True
         ... 
25180    True
25181    True
25182    True
25183    True
25184    True
Name: preds, Length: 25185, dtype: bool

In [26]:
#This code done by (Mikio Harman)
# Extract results and create columns for each
w,x,y,z = calc_confuse(new_df_copy)

new_df_copy["FP"] = w
new_df_copy["TP"] = x
new_df_copy["FN"] = y
new_df_copy["TN"] = z

# Look at 10 random rows to determin accuracy
new_df_copy.sample(10)

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,year,month,day,FP,TP,FN,TN
5524,1964-02-16,0.2,45,38,True,True,1964,2,16,0.0,1.0,0.0,0.0
23830,2014-03-31,0.0,60,36,False,False,2014,3,31,0.0,0.0,0.0,1.0
16185,1993-04-25,0.52,55,45,True,True,1993,4,25,0.0,1.0,0.0,0.0
20090,2004-01-03,0.02,36,23,True,True,2004,1,3,0.0,1.0,0.0,0.0
22418,2010-05-19,0.26,68,46,True,True,2010,5,19,0.0,1.0,0.0,0.0
13369,1985-08-09,0.03,70,55,True,False,1985,8,9,0.0,0.0,1.0,0.0
13349,1985-07-20,0.0,91,61,False,False,1985,7,20,0.0,0.0,0.0,1.0
16201,1993-05-11,0.0,68,48,False,False,1993,5,11,0.0,0.0,0.0,1.0
5884,1965-02-10,0.02,44,34,True,False,1965,2,10,0.0,0.0,1.0,0.0
22517,2010-08-26,0.0,69,55,False,True,2010,8,26,1.0,0.0,0.0,0.0


In [27]:
(sum(new_df_copy["TP"]) + sum(new_df_copy["TN"]))/ len(new_df_copy)

0.5364304149295216

# Third One 

In [30]:
# Another algorithm 
#deleting the four columns 
del new_df_copy['FP']
del new_df_copy['TP']
del new_df_copy['FN']
del new_df_copy['TN']
new_df_copy


Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,year,month,day
0,1949-01-01,0.00,35,26,False,True,1949,1,1
1,1949-01-02,0.03,40,22,True,True,1949,1,2
2,1949-01-03,0.00,40,17,False,True,1949,1,3
3,1949-01-04,0.00,42,20,False,True,1949,1,4
4,1949-01-05,0.00,41,28,False,True,1949,1,5
...,...,...,...,...,...,...,...,...,...
25180,2017-12-10,0.00,49,34,False,True,2017,12,10
25181,2017-12-11,0.00,49,29,False,True,2017,12,11
25182,2017-12-12,0.00,46,32,False,True,2017,12,12
25183,2017-12-13,0.00,48,34,False,True,2017,12,13


In [35]:
##### The fourth attempt
def heuristic_fourth(df_date):
    """
    Simple heuristic:
    
    it takes per each month and insert the prediction values 
    """
 
    
    
#     df_date.iloc[12]["RAIN"]
    preds = []
    
    for x in range(len(df_date)):
        if x<1:
            preds.append(False)
            
        else :
        
            if (df.iloc[x-1]["RAIN"] != df.iloc[x-2]["RAIN"] == True):

                if (df_date.iloc[x]["month"] == 1):
                    preds.append(True)
                elif df_date.iloc[x]["month"] == 2:
                    preds.append(True)
                elif df_date.iloc[x]["month"] == 3:
                    preds.append(True)
                elif df_date.iloc[x]["month"] == 4:
                    preds.append(False )
                elif df_date.iloc[x]["month"] == 5:
                    preds.append(False )
                elif df_date.iloc[x]["month"] == 6:
                    preds.append(False )
                elif df_date.iloc[x]["month"] == 7:
                    preds.append(False )
                elif df_date.iloc[x]["month"] == 8:
                    preds.append(False )
                elif df_date.iloc[x]["month"] == 9:
                    preds.append(False )
                elif df_date.iloc[x]["month"] == 10:
                    preds.append(False )
                elif df_date.iloc[x]["month"] == 11:
                    preds.append(True)
                elif df_date.iloc[x]["month"] == 12:
                    preds.append(True)

            elif (df.iloc[x-1]["RAIN"] == True):
                preds.append(True)
            else:
                preds.append(False)
            
            
    return preds

In [36]:
new_df_copy["preds"] = heuristic_fourth(new_df_copy)
new_df_copy["preds"]

0        False
1         True
2         True
3         True
4         True
         ...  
25180     True
25181     True
25182     True
25183     True
25184     True
Name: preds, Length: 25185, dtype: bool

In [37]:
#This code done by (Mikio Harman)
# Extract results and create columns for each
w,x,y,z = calc_confuse(new_df_copy)

new_df_copy["FP"] = w
new_df_copy["TP"] = x
new_df_copy["FN"] = y
new_df_copy["TN"] = z

# Look at 10 random rows to determin accuracy
new_df_copy.sample(10)

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,year,month,day,FP,TP,FN,TN
23426,2013-02-20,0.06,46,34,True,True,2013,2,20,0.0,1.0,0.0,0.0
11512,1980-07-09,0.15,64,52,True,False,1980,7,9,0.0,0.0,1.0,0.0
21717,2008-06-17,0.0,62,48,False,False,2008,6,17,0.0,0.0,0.0,1.0
23959,2014-08-07,0.0,78,56,False,False,2014,8,7,0.0,0.0,0.0,1.0
9484,1974-12-20,0.6,54,47,True,True,1974,12,20,0.0,1.0,0.0,0.0
2461,1955-09-28,0.07,58,46,True,False,1955,9,28,0.0,0.0,1.0,0.0
14686,1989-03-18,0.29,51,41,True,False,1989,3,18,0.0,0.0,1.0,0.0
14621,1989-01-12,0.12,46,42,True,True,1989,1,12,0.0,1.0,0.0,0.0
12459,1983-02-11,0.14,59,46,True,False,1983,2,11,0.0,0.0,1.0,0.0
20152,2004-03-05,0.13,52,40,True,False,2004,3,5,0.0,0.0,1.0,0.0


In [38]:
(sum(new_df_copy["TP"]) + sum(new_df_copy["TN"]))/ len(new_df_copy)

0.5703394877903514