In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [2]:
### Importing the Dataset
# First attempt: no separator is given, so read_table splits on tabs (its default).
# The file is pipe-delimited, so every record ends up in a single column and the
# first record is used as the header.
data = pd.read_table("http://bit.ly/movieusers")
data.head(2)

Unnamed: 0,1|24|M|technician|85711
0,2|53|F|other|94043
1,3|23|M|writer|32067


In [3]:
# Second attempt: sep="|" splits the fields correctly, but the first record is
# still consumed as the header row because the file has no header line.
data = pd.read_table("http://bit.ly/movieusers",sep= "|")
data.head(2)

Unnamed: 0,1,24,M,technician,85711
0,2,53,F,other,94043
1,3,23,M,writer,32067


In [4]:
# Final form: header=None keeps the first record as data and labels the columns
# with integers (0..4).
data = pd.read_table("http://bit.ly/movieusers",sep= "|",header=None)
data.head(2)

Unnamed: 0,0,1,2,3,4
0,1,24,M,technician,85711
1,2,53,F,other,94043


### Handling the Missing Values

In [5]:
# Load the small weather sample. The -99999 readings and the 0 events are
# placeholder values; they are replaced with NaN further down.
df = pd.read_csv("https://raw.githubusercontent.com/Mounika-Kajjam/Datasets/master/weather_data.csv")
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,-99999,7,Sunny
2,1/3/2017,28,-99999,Snow
3,1/4/2017,-99999,7,0
4,1/5/2017,32,-99999,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


In [6]:
# Without date parsing the 'day' column is stored with the generic object dtype.
df['day'].dtype

dtype('O')

In [7]:
# Python type of the first 'day' entry (row label 0).
type(df['day'].loc[0])

str

In [8]:
# Raw temperature readings; the most frequent value (mode) is inspected next.
df.temperature

0       32
1   -99999
2       28
3   -99999
4       32
5       31
6       34
Name: temperature, dtype: int64

In [9]:
# How often each temperature value occurs, most frequent first.
df.temperature.value_counts()

-99999    2
 32       2
 31       1
 28       1
 34       1
Name: temperature, dtype: int64

In [10]:
# Rows that repeat an earlier row in every column (none in this dataset).
df.loc[df.duplicated()]

Unnamed: 0,day,temperature,windspeed,event


In [11]:
# Boolean mask: True where the temperature value already appeared in an earlier row.
df.temperature.duplicated()

0    False
1    False
2    False
3     True
4     True
5    False
6    False
Name: temperature, dtype: bool

In [12]:
# Rows whose temperature repeats a value seen in an earlier row.
df.loc[df.temperature.duplicated()]

Unnamed: 0,day,temperature,windspeed,event
3,1/4/2017,-99999,7,0
4,1/5/2017,32,-99999,Rain


In [13]:
# Rows whose event repeats a value seen in an earlier row.
df.loc[df.event.duplicated()]

Unnamed: 0,day,temperature,windspeed,event
4,1/5/2017,32,-99999,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


In [14]:
# How often each event label occurs.
df.event.value_counts()

Sunny    2
0        2
Rain     2
Snow     1
Name: event, dtype: int64

In [15]:
# Reload the same file, this time asking pandas to parse the 'day' column as
# dates while reading, so it is stored as datetime64 instead of strings.
df = pd.read_csv("https://raw.githubusercontent.com/Mounika-Kajjam/Datasets/master/weather_data.csv",parse_dates =['day'])
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-02,-99999,7,Sunny
2,2017-01-03,28,-99999,Snow
3,2017-01-04,-99999,7,0
4,2017-01-05,32,-99999,Rain
5,2017-01-06,31,2,Sunny
6,2017-01-06,34,5,0


In [16]:
# After parse_dates the 'day' column has a datetime64 dtype.
df['day'].dtype

dtype('<M8[ns]')

In [17]:
# Individual entries are now pandas Timestamp objects rather than strings.
type(df['day'].loc[0])

pandas._libs.tslib.Timestamp

In [18]:
# Turn the per-column placeholder values into NaN so pandas treats them as missing.
new_df = df.replace(
    to_replace={'temperature': -99999, 'windspeed': -99999, 'event': '0'},
    value=np.nan,
)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


### Imputation: the process of substituting estimated values for missing data

In [19]:
# Number of missing entries in each column.
new_df.isna().sum()

day            0
temperature    2
windspeed      2
event          2
dtype: int64

In [20]:
# Constant imputation: fillna() replaces every missing entry with the given value (0 here).
# Tip: press Shift+Tab repeatedly inside the call to bring up its documentation in Jupyter.
data = new_df.fillna(value=0)
data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,0.0,7.0,Sunny
2,2017-01-03,28.0,0.0,Snow
3,2017-01-04,0.0,7.0,0
4,2017-01-05,32.0,0.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,0


In [21]:
# ffill -- forward fill: each missing entry takes the last valid value above it
# in the same column. DataFrame.ffill() is used instead of fillna(method='ffill')
# because the `method` argument of fillna is deprecated in recent pandas releases.
f_data = new_df.ffill()
f_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,32.0,7.0,Sunny
2,2017-01-03,28.0,7.0,Snow
3,2017-01-04,28.0,7.0,Snow
4,2017-01-05,32.0,7.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Sunny


In [22]:
## bfill -- backfill: each missing entry takes the next valid value below it in
## the same column; a gap in the last row stays NaN because nothing follows it.
# DataFrame.bfill() is used instead of fillna(method='bfill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
b_data = new_df.bfill()
b_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,28.0,7.0,Sunny
2,2017-01-03,28.0,7.0,Snow
3,2017-01-04,32.0,7.0,Rain
4,2017-01-05,32.0,2.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


In [23]:
# Column means of the numeric columns only. numeric_only=True is required on
# pandas >= 2.0, where mean() no longer silently skips the text column 'event'
# and raises TypeError instead.
new_df.mean(numeric_only=True)

temperature    31.4
windspeed       5.4
dtype: float64

In [24]:
# Mean Imputation
# Fill the gaps in each numeric column with that column's mean. fillna() aligns
# the Series of means by column name, so the non-numeric 'event' column is left
# untouched. numeric_only=True keeps mean() from raising on 'event' (pandas >= 2.0).
mean_data = new_df.fillna(new_df.mean(numeric_only=True))
mean_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,31.4,7.0,Sunny
2,2017-01-03,28.0,5.4,Snow
3,2017-01-04,31.4,7.0,
4,2017-01-05,32.0,5.4,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


In [25]:
# Column medians of the numeric columns only. numeric_only=True is required on
# pandas >= 2.0, where median() raises TypeError on the text column 'event'.
new_df.median(numeric_only=True)

temperature    32.0
windspeed       6.0
dtype: float64

In [26]:
## Median Imputation
# Fill the gaps in each numeric column with that column's median; 'event' is
# left untouched. numeric_only=True keeps median() from raising on the text
# column (pandas >= 2.0).
median_data = new_df.fillna(new_df.median(numeric_only=True))
median_data

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,32.0,7.0,Sunny
2,2017-01-03,28.0,6.0,Snow
3,2017-01-04,32.0,7.0,
4,2017-01-05,32.0,6.0,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,


In [27]:
# Most frequent event label(s); mode() returns every value tied for the top count.
new_df['event'].mode()

0     Rain
1    Sunny
dtype: object

In [28]:
# Pick the second of the tied modes by position.
new_df['event'].mode().iloc[1]

'Sunny'

In [29]:
# Show the tied modes again before choosing one for imputation.
new_df['event'].mode()

0     Rain
1    Sunny
dtype: object

In [30]:
# Mode imputation for the categorical column: missing events take the first of
# the tied modes (position 0).
# NOTE: this overwrites new_df['event'] in place, so tables displayed by earlier
# cells no longer reflect the current contents of new_df.
first_mode = new_df['event'].mode().iloc[0]
new_df['event'] = new_df['event'].fillna(first_mode)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-02,,7.0,Sunny
2,2017-01-03,28.0,,Snow
3,2017-01-04,,7.0,Rain
4,2017-01-05,32.0,,Rain
5,2017-01-06,31.0,2.0,Sunny
6,2017-01-06,34.0,5.0,Rain


In [31]:
# Same mode imputation, but returned as a separate Series instead of being
# written back into the frame.
event_data = new_df['event'].fillna(new_df['event'].mode().iloc[0])
print(event_data)

0     Rain
1    Sunny
2     Snow
3     Rain
4     Rain
5    Sunny
6     Rain
Name: event, dtype: object


### Problem solving

In [33]:
import numpy as np

# 4x4 integer grid used for the sum exercises below.
arr1 = np.array([
    [19, 7, 19, 93],
    [94, 18, 4, 22],
    [5, 21, 95, 17],
    [20, 92, 20, 6],
])

In [41]:
# The four corner cells add up to 138 (row indices paired with column indices).
arr1[[0, 0, 3, 3], [0, 3, 0, 3]].sum()

138

In [44]:
# Sum of the top-left 2x2 square, computed two ways:
# cell by cell, then with a single slice.
print(sum(arr1[r, c] for r in (0, 1) for c in (0, 1)))
s = arr1[:2, :2].sum()
print(s)

138
138


In [43]:
# f: main diagonal (row == column); b: anti-diagonal (row + column == 3).
f = 0
b = 0
for k in range(4):
    f += arr1[k, k]
    b += arr1[k, 3 - k]
print(f)
print(b)

138
138


In [40]:
# Sum of the central 2x2 square (rows 1-2, columns 1-2) is 138.
# A slice replaces the nested loops; the former `i != 0 and i != 3 ...` guard
# was always true for indices drawn from range(1, 3), so it is dropped.
s = arr1[1:3, 1:3].sum()
s
            

138

In [34]:
# Row totals: axis=1 adds across the columns, giving one sum per row (each is 138)
arr1.sum(axis=1)

array([138, 138, 138, 138])

In [35]:
# Column totals: axis=0 adds down the rows, giving one sum per column (each is 138)
arr1.sum(axis=0)

array([138, 138, 138, 138])

In [36]:
# Forward (main) diagonal sum. Swapping axis1/axis2 takes the trace of the
# transpose, which has the same main diagonal, so the result is unchanged.
np.trace(arr1, axis1=1, axis2=0)

138

In [38]:
# Main-diagonal sum using trace's default axes (axis1=0, axis2=1).
np.trace(arr1)

138