## This notebook shows how to apply non-graphical univariate analysis to our data.

In [1]:
import pandas as pd


In [2]:
# Load the dataset into a DataFrame; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='AB_NYC_2019.csv')

In [4]:
# Peek at the last five rows to sanity-check the load.
df.tail(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
48901,5441,Central Manhattan/near Broadway,7989,Kate,Manhattan,Hell's Kitchen,40.76076,-73.98867,Private room,85,2,188,23-06-2019,1.5,1,39
48902,5803,"Lovely Room 1, Garden, Best Area, Legal rental",9744,Laurie,Brooklyn,South Slope,40.66829,-73.98779,Private room,89,4,167,24-06-2019,1.34,3,314
48903,6021,Wonderful Guest Bedroom in Manhattan for SINGLES,11528,Claudio,Manhattan,Upper West Side,40.79826,-73.96113,Private room,85,2,113,05-07-2019,0.91,1,333
48904,6090,West Village Nest - Superhost,11975,Alina,Manhattan,West Village,40.7353,-74.00525,Entire home/apt,120,90,27,31-10-2018,0.22,1,0
48905,6848,Only 2 stops to Manhattan studio,15991,Allen & Irina,Brooklyn,Williamsburg,40.70837,-73.95352,Entire home/apt,140,2,148,29-06-2019,1.2,1,46


In [5]:
# Drop every row that contains a missing value, then drop exact duplicate rows.
df = df.dropna().drop_duplicates()

## Converting some columns to string

In [7]:
# These columns are cast to str so that numeric summaries such as describe()
# leave them out (they are identifiers/coordinates, not quantities to average).
df = df.astype({
    'id': 'str',
    'host_id': 'str',
    'latitude': 'str',
    'longitude': 'str',
})

In [10]:
# Count, mean, std, min, quartiles and max for the columns that are still numeric.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,38821.0,38821.0,38821.0,38821.0,38821.0,38821.0
mean,142.332526,5.86922,29.290255,1.373229,5.166611,114.886299
std,196.994756,17.389026,48.1829,1.680328,26.302954,129.52995
min,0.0,1.0,1.0,0.01,1.0,0.0
25%,69.0,1.0,3.0,0.19,1.0,0.0
50%,101.0,2.0,9.0,0.72,1.0,55.0
75%,170.0,4.0,33.0,2.02,2.0,229.0
max,10000.0,1250.0,629.0,58.5,327.0,365.0


In [11]:
# Number of distinct values in each column (missing values are not counted).
df.nunique(axis=0, dropna=True)

id                                38821
name                              38244
host_id                           30232
host_name                          9885
neighbourhood_group                   5
neighbourhood                       218
latitude                          17436
longitude                         13639
room_type                             3
price                               581
minimum_nights                       89
number_of_reviews                   393
last_review                        1764
reviews_per_month                   937
calculated_host_listings_count       47
availability_365                    366
dtype: int64

## Categorical
Let's see what kinds of non-graphical analysis we can perform on categorical data.

In [12]:
# Absolute frequency of each neighbourhood group, most common first.
df['neighbourhood_group'].value_counts(normalize=False, sort=True)

neighbourhood_group
Manhattan        16621
Brooklyn         16439
Queens            4572
Bronx              875
Staten Island      314
Name: count, dtype: int64

In [13]:
# Same counts expressed as proportions of the total (fractions that sum to 1;
# multiply by 100 to read them as percentages).
(
    df['neighbourhood_group']
    .value_counts(normalize=True)
)

neighbourhood_group
Manhattan        0.428145
Brooklyn         0.423456
Queens           0.117771
Bronx            0.022539
Staten Island    0.008088
Name: proportion, dtype: float64

In [14]:
# Absolute frequency of each room type, most common first.
df['room_type'].value_counts(normalize=False, sort=True)

room_type
Entire home/apt    20321
Private room       17654
Shared room          846
Name: count, dtype: int64

In [24]:
# Share of each room type as a proportion of all rows (fractions that sum to 1).
(
    df['room_type']
    .value_counts(normalize=True)
)

room_type
Entire home/apt    0.523454
Private room       0.454754
Shared room        0.021792
Name: proportion, dtype: float64

In [31]:
# Count rows per neighbourhood and turn the resulting Series into a two-column DataFrame.
# rename_axis() / reset_index(name=...) set both column names explicitly, so this does
# not rely on value_counts() naming its result 'count' (which only holds on pandas >= 2.0;
# on older versions the previous rename() was a silent no-op).
# NOTE(review): rows look like individual listings rather than hotels; the column name
# 'number_of_hotels' is kept because later cells filter on it.
df_n = (
    df['neighbourhood']
    .value_counts()
    .rename_axis('neighbourhood')
    .reset_index(name='number_of_hotels')
)

In [34]:
# Keep only the neighbourhoods whose count is strictly greater than 500.
df_n.loc[df_n['number_of_hotels'].gt(500)]

Unnamed: 0,neighbourhood,number_of_hotels
0,Williamsburg,3163
1,Bedford-Stuyvesant,3141
2,Harlem,2204
3,Bushwick,1942
4,Hell's Kitchen,1528
5,East Village,1489
6,Upper West Side,1482
7,Upper East Side,1405
8,Crown Heights,1265
9,Midtown,986


## Numerical - 

In [36]:
# Calling value_counts() directly on a numeric column such as price gives one row per
# distinct value, which is too fine-grained to read.
# Passing bins=N instead splits the observed range (min to max) into N equal-width
# intervals and counts how many values fall into each interval.
(
    df['price']
    .value_counts(bins=5)  # price spans 0 to 10000 here, so each of the 5 bins is 2000 wide
)

(-10.001, 2000.0]    38786
(2000.0, 4000.0]        20
(4000.0, 6000.0]         8
(8000.0, 10000.0]        5
(6000.0, 8000.0]         2
Name: count, dtype: int64

In [39]:
# Instead of equal-width bins we can supply our own bin edges; each interval is
# closed on the right, and the lowest edge is nudged down so that a price of 0 is included.
bins = (0, 50, 100, 200, 500, 2_000, 10_000)
df['price'].value_counts(bins=bins)  # counts per hand-picked price band

(50.0, 100.0]        14212
(100.0, 200.0]       13544
(200.0, 500.0]        5267
(-0.001, 50.0]        5176
(500.0, 2000.0]        587
(2000.0, 10000.0]       35
Name: count, dtype: int64

In [42]:
# Mean (average) price; float() converts the result to a plain Python float for display.
float(df['price'].mean(skipna=True))

142.33252621004095

In [43]:
# Sample standard deviation of price (ddof=1). This is the spread in price units,
# not the variance; use .var() if the variance itself is needed.
float(df['price'].std(ddof=1))

196.9947559183469

In [44]:
# Skewness of price: about 0 for a symmetric distribution, positive when the
# right tail (high prices) is the longer one.
float(df['price'].skew(skipna=True))

23.673594295123014

In [45]:
# Excess kurtosis (Fisher's definition, normal distribution == 0): it measures how heavy
# the tails are, i.e. how outlier-prone the data is, rather than the height of the curve.
float(df['price'].kurtosis())

953.4807356344944

In [58]:
# Pairwise Pearson correlation between the numeric columns; non-numeric columns are skipped.
df.corr(method='pearson', numeric_only=True)

Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
price,1.0,0.025501,-0.035924,-0.030623,0.052895,0.078276
minimum_nights,0.025501,1.0,-0.069366,-0.121712,0.073474,0.101658
number_of_reviews,-0.035924,-0.069366,1.0,0.549699,-0.059796,0.193409
reviews_per_month,-0.030623,-0.121712,0.549699,1.0,-0.009442,0.185896
calculated_host_listings_count,0.052895,0.073474,-0.059796,-0.009442,1.0,0.182981
availability_365,0.078276,0.101658,0.193409,0.185896,0.182981,1.0
