# Pandas

-  Pandas is a Python package providing fast, flexible, and expressive data structures designed to make working with “relational” or “labeled” data both easy and intuitive.  
-  It aims to be the fundamental high-level building block for doing practical, real world data analysis in Python. 

<h2>Pandas is well suited for:</h2>

 - Tabular data with heterogeneously-typed columns, as in an SQL table or Excel spreadsheet 
 - Ordered and unordered time series data 
 - Arbitrary matrix data (homogeneously typed or heterogeneous) with row and column labels 

In [2]:
import pandas as pd

# Absolute path to the salary CSV on the original author's Windows machine.
# TODO(review): replace with a project-relative path so the notebook runs elsewhere.
salary_csv_path = "C://Users//ANKIT GARG//Documents//ML//Class Notes//logistic//Simple_Linear_Regression//Salary_Data.csv"
data = pd.read_csv(salary_csv_path)

In [3]:
# Show the last 7 rows; data.head() is the counterpart for the first rows.
data.tail(n=7)


Unnamed: 0,YearsExperience,Salary
23,8.2,113812.0
24,8.7,109431.0
25,9.0,105582.0
26,9.5,116969.0
27,9.6,112635.0
28,10.3,122391.0
29,10.5,121872.0


In [4]:
# Check what kind of object `data` is.
type(data)

pandas.core.frame.DataFrame

In [5]:
# Dimensions of the frame as (number of rows, number of columns).
data.shape

(30, 2)

In [6]:
# Data type of each column.
data.dtypes

YearsExperience    float64
Salary             float64
dtype: object

In [7]:
# Column labels of the frame.
data.columns


Index(['YearsExperience', 'Salary'], dtype='object')

In [8]:
# Column names can be replaced by assigning a new list of labels,
# one per existing column, in order.
data.columns = ["exp", "sal"]
data.head(n=5)

Unnamed: 0,exp,sal
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,exp,sal
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
25%,3.2,56720.75
50%,4.7,65237.0
75%,7.7,100544.75
max,10.5,122391.0


In [10]:
# Select a single column by its label.
# Attribute access (data.exp) returns the same Series for this column.
data["exp"]

0      1.1
1      1.3
2      1.5
3      2.0
4      2.2
5      2.9
6      3.0
7      3.2
8      3.2
9      3.7
10     3.9
11     4.0
12     4.0
13     4.1
14     4.5
15     4.9
16     5.1
17     5.3
18     5.9
19     6.0
20     6.8
21     7.1
22     7.9
23     8.2
24     8.7
25     9.0
26     9.5
27     9.6
28    10.3
29    10.5
Name: exp, dtype: float64

In [11]:
# Boolean frame of the same shape as `data`: True wherever a value is missing.
# isna() is the method that isnull() aliases.
data.isna()

Unnamed: 0,exp,sal
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,False,False
6,False,False
7,False,False
8,False,False
9,False,False


In [12]:
# Number of missing values in each column.
data.isna().sum()

exp    0
sal    0
dtype: int64

In [13]:
# Position-based slice: the first five rows and the first two columns.
data.iloc[:5, :2]

Unnamed: 0,exp,sal
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


# Scikit

-  Scikit focuses on modeling data.  
-  Scikit is not focused on loading, manipulating and summarizing data.  
-  NumPy and Pandas are used for loading, manipulating and summarizing data. 

In [54]:
# Work on an independent (deep) copy so changes to `salary` do not touch `data`.
salary = data.copy(deep=True)

In [15]:
# Display the `salary` frame.
salary

Unnamed: 0,exp,sal
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0
5,2.9,56642.0
6,3.0,60150.0
7,3.2,54445.0
8,3.2,64445.0
9,3.7,57189.0


In [16]:
# drop() removes the row with the given index label and returns a new frame;
# without inplace=True, `salary` itself is left unchanged.
temp_slary = salary.drop(index=0)
temp_slary.head()

Unnamed: 0,exp,sal
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0
5,2.9,56642.0


In [17]:
# Row labels of `data`.
data.index

RangeIndex(start=0, stop=30, step=1)

In [18]:
# Drop the row whose index label is 1, modifying `salary` in place.
salary.drop(index=1, inplace=True)
salary.head()

Unnamed: 0,exp,sal
0,1.1,39343.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0
5,2.9,56642.0


In [19]:
# Index label stored at position 1 (positions count from 0, independent of the labels).
salary.index[1]

2

In [22]:
# Keep only the rows whose `exp` value is greater than 4.0.
salary.loc[salary["exp"] > 4.0]

Unnamed: 0,exp,sal
13,4.1,57081.0
14,4.5,61111.0
15,4.9,67938.0
16,5.1,66029.0
17,5.3,83088.0
18,5.9,81363.0
19,6.0,93940.0
20,6.8,91738.0
21,7.1,98273.0
22,7.9,101302.0


exp        1.1
sal    39343.0
Name: 0, dtype: float64

In [29]:
# Drop the row whose index label is 2, modifying `salary` in place.
salary.drop(index=2, inplace=True)


In [30]:
# Display `salary` after the in-place row drop.
salary

Unnamed: 0,exp,sal
3,2.0,43525.0
4,2.2,39891.0
5,2.9,56642.0
6,3.0,60150.0
7,3.2,54445.0
8,3.2,64445.0
9,3.7,57189.0
10,3.9,63218.0
11,4.0,55794.0
12,4.0,56957.0


In [31]:
salary.iloc[0]
# Position-based lookup: returns the first row, whatever its index label is.


exp        2.0
sal    43525.0
Name: 3, dtype: float64

In [33]:
# Label-based lookup: returns the row whose index label is 3.
salary.loc[3]

exp        2.0
sal    43525.0
Name: 3, dtype: float64

In [42]:
# .loc assigns by label: if a row labelled 0 exists it is overwritten; if no
# such label exists, a new row labelled 0 is appended at the end of the frame.
salary.loc[0] = [1.3,2400.9]

In [43]:
# Display `salary` again to inspect the current rows.
salary

Unnamed: 0,exp,sal
3,1.3,2400.9
4,2.2,39891.0
5,2.9,56642.0
6,3.0,60150.0
7,3.2,54445.0
8,3.2,64445.0
9,3.7,57189.0
10,3.9,63218.0
11,4.0,55794.0
12,4.0,56957.0


In [44]:
# reset_index() returns a frame with a fresh 0..n-1 index. By default
# (drop=False) the old index is kept as a new column named "index".
# The result is not assigned, so `salary` itself is unchanged.
salary.reset_index(drop=False)



Unnamed: 0,index,exp,sal
0,3,1.3,2400.9
1,4,2.2,39891.0
2,5,2.9,56642.0
3,6,3.0,60150.0
4,7,3.2,54445.0
5,8,3.2,64445.0
6,9,3.7,57189.0
7,10,3.9,63218.0
8,11,4.0,55794.0
9,12,4.0,56957.0


In [45]:
# drop=True discards the old index instead of adding it as an "index" column.
# The result is only displayed, not assigned back to `salary`.
salary.reset_index(drop=True)

Unnamed: 0,exp,sal
0,1.3,2400.9
1,2.2,39891.0
2,2.9,56642.0
3,3.0,60150.0
4,3.2,54445.0
5,3.2,64445.0
6,3.7,57189.0
7,3.9,63218.0
8,4.0,55794.0
9,4.0,56957.0


In [46]:
# Row labels currently held by `salary`.
salary.index

Int64Index([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
            20, 21, 22, 23, 24, 25, 26, 27, 28, 29,  0],
           dtype='int64')

<h2>axis = 1 </h2>

It refers to the columns: for example, `drop("exp", axis=1)` removes the column named `exp`.

<h2>axis = 0</h2>

It refers to the rows: for example, `drop(1)` (where `axis=0` is the default) removes the row labelled `1`.


In [47]:
# Remove the "exp" column, modifying `salary` in place.
salary.drop(columns="exp", inplace=True)

In [49]:
# First five rows of `salary`.
salary.head()

Unnamed: 0,sal
3,2400.9
4,39891.0
5,56642.0
6,60150.0
7,54445.0


In [50]:
# Summary statistics for the numeric columns of `salary`.
salary.describe()

Unnamed: 0,sal
count,28.0
mean,75645.992857
std,32130.811631
min,2400.9
25%,57050.0
50%,66983.5
75%,102372.0
max,122391.0


In [51]:
# `del` removes the named column from the frame in place.
del salary["sal"]

In [53]:
# describe() raises ValueError when the DataFrame has no columns at all.
# Catch the error and print its message so this demonstration does not stop
# a top-to-bottom run of the notebook.
try:
    print(salary.describe())
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Cannot describe a DataFrame without columns

In [56]:
# Start again from a fresh, independent copy of `data`.
salary = data.copy(deep=True)
salary.head()

Unnamed: 0,exp,sal
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [58]:
# Add a derived column computed element-wise from two existing columns.
salary["diff_sal_exp"] = salary["sal"].sub(salary["exp"])
salary.head()

Unnamed: 0,exp,sal,diff_sal_exp
0,1.1,39343.0,39341.9
1,1.3,46205.0,46203.7
2,1.5,37731.0,37729.5
3,2.0,43525.0,43523.0
4,2.2,39891.0,39888.8


In [75]:
import numpy as np

# Create missing values: set the rows at positions 1-3 to NaN in the columns
# at positions 1-2 (iloc slices are position-based and exclude the end bound).
salary.iloc[1:4,1:3] = np.nan

In [70]:
# Display `salary` to inspect the missing values.
salary

Unnamed: 0,exp,sal,diff_sal_exp
0,1.1,39343.0,39341.9
1,1.3,,
2,1.5,,
3,2.0,,
4,2.2,,
5,2.9,,
6,3.0,,
7,3.2,,
8,3.2,,
9,3.7,,


In [72]:
# Summary statistics; `count` reports only the non-missing values in each column.
salary.describe()

Unnamed: 0,exp,sal,diff_sal_exp
count,30.0,21.0,21.0
mean,5.313333,86660.333333,86653.838095
std,2.837888,25612.294853,25609.809219
min,1.1,39343.0,39341.9
25%,3.2,63218.0,63214.1
50%,4.7,91738.0,91731.2
75%,7.7,109431.0,109422.3
max,10.5,122391.0,122380.7


In [74]:
# Remove, in place, every row that contains at least one missing value.
# NOTE(review): when the notebook is run top to bottom this leaves no NaN in
# `salary`, so any later fillna() call has nothing to fill unless missing
# values are introduced again first.
salary.dropna(how="any", inplace=True)

In [77]:
# First five rows of `salary`.
salary.head()

Unnamed: 0,exp,sal,diff_sal_exp
0,1.1,39343.0,39341.9
10,3.9,,
11,4.0,,
12,4.0,,
13,4.1,57081.0,57076.9


In [78]:
# Replace missing salaries with the mean of the column (mean() skips NaN).
# The result is assigned back to the frame: calling fillna(inplace=True) on
# `salary.sal` is chained assignment, which warns in recent pandas versions
# and leaves `salary` unmodified when Copy-on-Write is enabled.
salary["sal"] = salary["sal"].fillna(salary["sal"].mean())

In [79]:
# First five rows of `salary`.
salary.head()

Unnamed: 0,exp,sal,diff_sal_exp
0,1.1,39343.0,39341.9
10,3.9,91327.666667,
11,4.0,91327.666667,
12,4.0,91327.666667,
13,4.1,57081.0,57076.9


In [80]:
# Replace missing values in `diff_sal_exp` with the mean of that column.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# `salary.diff_sal_exp` is chained assignment, which warns in recent pandas
# versions and leaves `salary` unmodified when Copy-on-Write is enabled.
salary["diff_sal_exp"] = salary["diff_sal_exp"].fillna(salary["diff_sal_exp"].mean())
salary.head()

Unnamed: 0,exp,sal,diff_sal_exp
0,1.1,39343.0,39341.9
10,3.9,91327.666667,91320.75
11,4.0,91327.666667,91320.75
12,4.0,91327.666667,91320.75
13,4.1,57081.0,57076.9
