In [24]:
import pandas as pd
import numpy as np
import scipy.stats as st
from datetime import datetime as dt


In [8]:
# Load the NYC weather CSV, parsing the "EST" column as datetimes,
# then preview the first five rows.
nycdata = pd.read_csv(
    filepath_or_buffer="nyc_weather.csv",
    parse_dates=["EST"],
)
nycdata.head()

Unnamed: 0,EST,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
0,2016-01-01,38,23,52,30.03,10,8.0,0,5,,281
1,2016-01-02,36,18,46,30.02,10,7.0,0,3,,275
2,2016-01-03,40,21,47,29.86,10,8.0,0,1,,277
3,2016-01-04,25,9,44,30.05,10,9.0,0,3,,345
4,2016-01-05,20,-3,41,30.57,10,5.0,0,0,,333


In [3]:
# Summarise each column's dtype and non-null count to spot missing values.
nycdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   EST                   31 non-null     datetime64[ns]
 1   Temperature           31 non-null     int64         
 2   DewPoint              31 non-null     int64         
 3   Humidity              31 non-null     int64         
 4   Sea Level PressureIn  31 non-null     float64       
 5   VisibilityMiles       31 non-null     int64         
 6   WindSpeedMPH          28 non-null     float64       
 7   PrecipitationIn       31 non-null     object        
 8   CloudCover            31 non-null     int64         
 9   Events                9 non-null      object        
 10  WindDirDegrees        31 non-null     int64         
dtypes: datetime64[ns](1), float64(2), int64(6), object(2)
memory usage: 2.8+ KB


In [4]:
# Count the missing values in every column.
nycdata.isna().sum()

EST                      0
Temperature              0
DewPoint                 0
Humidity                 0
Sea Level PressureIn     0
VisibilityMiles          0
WindSpeedMPH             3
PrecipitationIn          0
CloudCover               0
Events                  22
WindDirDegrees           0
dtype: int64

In [5]:
# Mean wind speed; pandas skips missing values when averaging by default.
nycdata["WindSpeedMPH"].mean()

6.892857142857143

In [7]:
# Fill missing wind-speed readings with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# acts on an intermediate object, raises a FutureWarning in pandas 2.x and
# leaves the frame unchanged when Copy-on-Write is enabled.
wind_speed_mean = nycdata["WindSpeedMPH"].mean()
nycdata["WindSpeedMPH"] = nycdata["WindSpeedMPH"].fillna(wind_speed_mean)

# Re-check the per-column null counts to confirm the gaps are gone.
nycdata.isnull().sum()

EST                      0
Temperature              0
DewPoint                 0
Humidity                 0
Sea Level PressureIn     0
VisibilityMiles          0
WindSpeedMPH             0
PrecipitationIn          0
CloudCover               0
Events                  22
WindDirDegrees           0
dtype: int64

In [None]:
# Alternative strategy: forward-fill, i.e. carry the last valid wind-speed
# reading forward into each gap.
# Series.ffill() replaces the deprecated fillna(method="ffill"), and the result
# is assigned back instead of using a chained inplace=True call, which does not
# update the frame under Copy-on-Write.
# NOTE(review): if the mean-fill cell has already run on this frame there are
# no gaps left, so this becomes a no-op — reload the data to compare strategies.
nycdata["WindSpeedMPH"] = nycdata["WindSpeedMPH"].ffill()

In [9]:
# Frequency of each event type, most common first (missing values are
# excluded by default); the top entry is the mode.
nycdata["Events"].value_counts()

Rain        4
Snow        3
Fog-Snow    2
Name: Events, dtype: int64

In [10]:
# Another way to get the mode: Series.mode() returns a Series holding the
# most frequent value(s).
nycdata["Events"].mode()

0    Rain
dtype: object

In [11]:
# mode() returns every value tied for the highest count (in sorted order);
# [0] picks the first of them so we end up with a single scalar.
nycdata["Events"].mode()[0]

'Rain'

In [12]:
# Drop rows in which *every* column is null (how="all"); rows that are only
# partially missing are kept. The frame is modified in place.
nycdata.dropna(how="all", axis="index", inplace=True)

> Merging

In [14]:
# Two small example frames for the merge demo. They share some people but
# store the person's name under different column names.
df1 = pd.DataFrame(
    data=dict(
        name=["Samuel", "Rodgers", "Vivian", "Miedema"],
        gender=["Male", "Male", "Female", "Female"],
    )
)

df2 = pd.DataFrame(
    data={
        "customer name": ["Vivian", "Miedema", "Tom", "Samuel", "Fred"],
        "age": [21, 25, 23, 20, 27],
    }
)

In [17]:
# Display the name/gender frame.
df1

Unnamed: 0,name,gender
0,Samuel,Male
1,Rodgers,Male
2,Vivian,Female
3,Miedema,Female


In [18]:
# Display the customer-name/age frame.
df2

Unnamed: 0,customer name,age
0,Vivian,21
1,Miedema,25
2,Tom,23
3,Samuel,20
4,Fred,27


In [21]:
# Merge the two frames on the person's name. Use `on=` when the key column has
# the same name in both frames; use `left_on=` / `right_on=` when it differs,
# as it does here. An inner join keeps only names present in both frames, and
# both key columns ("name" and "customer name") appear in the result.
full_data = df1.merge(
    df2,
    how="inner",
    left_on="name",
    right_on="customer name",
)
full_data

Unnamed: 0,name,gender,customer name,age
0,Samuel,Male,Samuel,20
1,Vivian,Female,Vivian,21
2,Miedema,Female,Miedema,25


> Dropping duplicates

In [27]:
# Load the E-Corp sales records and preview the first five rows.
ecorp = pd.read_csv(filepath_or_buffer="../fileInpOut/ecorp data.csv")
ecorp.head()

Unnamed: 0,name,sales,date
0,Sebastian Waite,607193,1/1/2020
1,Lizzie Haggins,1171366,1/1/2020
2,Ima Gaudreau,941504,1/1/2020
3,Leticia Michelsen,721826,1/1/2020
4,Stanley Margolin,904494,1/1/2020


In [23]:
# Drop rows whose "name" column repeats an earlier row, keeping the first
# occurrence of each name. The frame is modified in place.
ecorp.drop_duplicates(subset=["name"], keep="first", inplace=True)

In [24]:
# (rows, columns) remaining after de-duplication.
ecorp.shape

(549, 3)

In [26]:
# Drop rows whose "date" column repeats an earlier row, keeping the first
# occurrence of each date. The frame is modified in place.
# NOTE(review): this leaves only one record per distinct date and discards
# every other row sharing that date — confirm that is intended.
ecorp.drop_duplicates(subset=["date"], keep="first", inplace=True)

# (rows, columns) remaining after the date de-duplication.
ecorp.shape

(329, 3)

> Diagnostic Analysis

In [2]:
# Load the e-commerce expense transactions and preview the first five rows.
ecomdata = pd.read_csv(filepath_or_buffer="Files/Ecom Expense.csv")
ecomdata.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


In [14]:
# Total spend per city tier, as a one-column DataFrame indexed by "City Tier".
tots = ecomdata.groupby("City Tier")["Total Spend"].sum().to_frame()
tots

Unnamed: 0_level_0,Total Spend
City Tier,Unnamed: 1_level_1
Tier 1,5096583.0
Tier 2,4798944.0
Tier 3,4661896.0


In [15]:
# Average spend per city tier, as a one-column DataFrame indexed by "City Tier".
avgdata = ecomdata.groupby("City Tier")["Total Spend"].mean().to_frame()
avgdata

Unnamed: 0_level_0,Total Spend
City Tier,Unnamed: 1_level_1
Tier 1,6253.475599
Tier 2,6136.757309
Tier 3,6093.981524


In [21]:
# Overall mean of "Total Spend" across all transactions.
meantots = ecomdata["Total Spend"].mean()
meantots

6163.176415976714

In [22]:
# Standard deviation of "Total Spend"; pandas' Series.std() defaults to the
# sample standard deviation (ddof=1).
sdtots = ecomdata["Total Spend"].std()
sdtots

2799.7720603578377

In [23]:
# Z-score of a 5200 spend: how many standard deviations (sdtots) it lies
# from the overall mean spend (meantots). A negative value means below the mean.
x = 5200
deviation = x - meantots
zscore = deviation / sdtots
zscore

-0.34401958274189337