In [1]:
import pandas as pd          # Data handling
import numpy as np           # Numerical operations
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [3]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]

df = pd.read_csv(url, header=None, names=columns)


In [5]:
df.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [7]:
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [9]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [11]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [13]:
df.shape

(150, 5)

In [15]:
df.dtypes


sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

<H1>Normalization</H1>

In [17]:
scaler = StandardScaler()

numeric_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


<h2>Categorical into quantitative</h2>

In [19]:
encoder = LabelEncoder()
df["species"] = encoder.fit_transform(df["species"])


In [21]:
df.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,-0.900681,1.032057,-1.341272,-1.312977,0
1,-1.143017,-0.124958,-1.341272,-1.312977,0
2,-1.385353,0.337848,-1.398138,-1.312977,0
3,-1.506521,0.106445,-1.284407,-1.312977,0
4,-1.021849,1.26346,-1.341272,-1.312977,0


<h2>Integration of data</h2>

In [23]:
df1 = pd.DataFrame({
'id':[1,2,3,4,5],
'Name': ['Ram', 'Sham', 'Sita', 'Geeta', 'Meena'],
'subject_id':['DBMS','TOC','SPOS','MATHS','WT']})
df2 = pd.DataFrame(
{'id':[1,2,3,4,5],
'Name': ['X', 'Y', 'Z', 'A', 'B'],
'subject_id':['DS','TOC','IOT','MATHS','WT']})
print(df1)
print(df2)

   id   Name subject_id
0   1    Ram       DBMS
1   2   Sham        TOC
2   3   Sita       SPOS
3   4  Geeta      MATHS
4   5  Meena         WT
   id Name subject_id
0   1    X         DS
1   2    Y        TOC
2   3    Z        IOT
3   4    A      MATHS
4   5    B         WT


<h2>Merging of data</h2>

In [25]:
print(pd.merge(df1,df2,on='id')) 

   id Name_x subject_id_x Name_y subject_id_y
0   1    Ram         DBMS      X           DS
1   2   Sham          TOC      Y          TOC
2   3   Sita         SPOS      Z          IOT
3   4  Geeta        MATHS      A        MATHS
4   5  Meena           WT      B           WT


In [27]:
print(pd.merge(df1,df2,on=['id','subject_id']))

   id Name_x subject_id Name_y
0   2   Sham        TOC      Y
1   4  Geeta      MATHS      A
2   5  Meena         WT      B


In [29]:
print(pd.merge(df1,df2,on='subject_id',how='left'))

   id_x Name_x subject_id  id_y Name_y
0     1    Ram       DBMS   NaN    NaN
1     2   Sham        TOC   2.0      Y
2     3   Sita       SPOS   NaN    NaN
3     4  Geeta      MATHS   4.0      A
4     5  Meena         WT   5.0      B


In [31]:
print(pd.merge(df1,df2,on='subject_id',how='right'))

   id_x Name_x subject_id  id_y Name_y
0   NaN    NaN         DS     1      X
1   2.0   Sham        TOC     2      Y
2   NaN    NaN        IOT     3      Z
3   4.0  Geeta      MATHS     4      A
4   5.0  Meena         WT     5      B


In [33]:
print(pd.merge(df1,df2,on='subject_id',how='inner'))

   id_x Name_x subject_id  id_y Name_y
0     2   Sham        TOC     2      Y
1     4  Geeta      MATHS     4      A
2     5  Meena         WT     5      B


In [35]:
print(pd.merge(df1,df2,on='subject_id',how='outer'))

   id_x Name_x subject_id  id_y Name_y
0   1.0    Ram       DBMS   NaN    NaN
1   NaN    NaN         DS   1.0      X
2   NaN    NaN        IOT   3.0      Z
3   4.0  Geeta      MATHS   4.0      A
4   3.0   Sita       SPOS   NaN    NaN
5   2.0   Sham        TOC   2.0      Y
6   5.0  Meena         WT   5.0      B


<h2>Concatenation of data</h2>

In [38]:
df3 = pd.DataFrame({
'id':[1,2,3,4,5],
'Name': ['Ram', 'Sham', 'Sita', 'Geeta', 'Meena'],
'subject_id':['DBMS','TOC','SPOS','MATHS','WT']})
df4 = pd.DataFrame(
{'id':[1,2,3,4,5],
'Name': ['X', 'Y', 'Z', 'A', 'B'],
'subject_id':['DS','DELD','IOT','MATHS','WT']})
print(pd.concat([df3,df4]))

   id   Name subject_id
0   1    Ram       DBMS
1   2   Sham        TOC
2   3   Sita       SPOS
3   4  Geeta      MATHS
4   5  Meena         WT
0   1      X         DS
1   2      Y       DELD
2   3      Z        IOT
3   4      A      MATHS
4   5      B         WT


In [40]:
print(pd.concat([df1,df2],keys=['x','y'])) #to associate specific keys

     id   Name subject_id
x 0   1    Ram       DBMS
  1   2   Sham        TOC
  2   3   Sita       SPOS
  3   4  Geeta      MATHS
  4   5  Meena         WT
y 0   1      X         DS
  1   2      Y        TOC
  2   3      Z        IOT
  3   4      A      MATHS
  4   5      B         WT


In [42]:
print(pd.concat([df1,df2],keys=['x','y'], ignore_index=True)) #ignore_index to True.

   id   Name subject_id
0   1    Ram       DBMS
1   2   Sham        TOC
2   3   Sita       SPOS
3   4  Geeta      MATHS
4   5  Meena         WT
5   1      X         DS
6   2      Y        TOC
7   3      Z        IOT
8   4      A      MATHS
9   5      B         WT


In [44]:
#If two objects need to be added along axis=1, then the new columns will be appended
print(pd.concat([df1,df2],axis=1))

   id   Name subject_id  id Name subject_id
0   1    Ram       DBMS   1    X         DS
1   2   Sham        TOC   2    Y        TOC
2   3   Sita       SPOS   3    Z        IOT
3   4  Geeta      MATHS   4    A      MATHS
4   5  Meena         WT   5    B         WT


In [46]:
df5 = pd.DataFrame([[10, 20, 30, 40], [7, 14, 21, 28], [55, 15, 8, 12],
[15, 14, 1, 8], [7, 1, 1, 8], [5, 4, 9, 2]],
columns=['Apple', 'Orange', 'Banana', 'Pear'],
index=['Basket1', 'Basket2', 'Basket3', 'Basket4',
'Basket5', 'Basket6'])
print(df5)

         Apple  Orange  Banana  Pear
Basket1     10      20      30    40
Basket2      7      14      21    28
Basket3     55      15       8    12
Basket4     15      14       1     8
Basket5      7       1       1     8
Basket6      5       4       9     2


In [48]:
print(df5.Apple.mid_range())

AttributeError: 'Series' object has no attribute 'mid_range'

In [56]:
print("\n----------- Calculate Mean -----------\n")
print(df5.mean())


----------- Calculate Mean -----------

Apple     16.500000
Orange    11.333333
Banana    11.666667
Pear      16.333333
dtype: float64


In [58]:
print("\n----------- Calculate Median -----------\n")
print(df5.median())


----------- Calculate Median -----------

Apple      8.5
Orange    14.0
Banana     8.5
Pear      10.0
dtype: float64


In [60]:
print("\n----------- Calculate Mode -----------\n")
print(df5.mode())


----------- Calculate Mode -----------

   Apple  Orange  Banana  Pear
0      7      14       1     8
