# Top 5 Machine Learning Libraries in Python
In this article we are going to learn about __Pandas__.

## Pandas (Python Data Analysis Library)
__Pandas__ is one of the most popular Python libraries for data manipulation and analysis, and it is widely used in machine learning workflows. It provides a __fast__ and efficient __DataFrame__ object for data manipulation with integrated indexing.

### Creating a Series

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a one-dimensional labelled array (Series) from a plain Python list
pd.Series(data=[1, 2, 3, 4, 5])

0    1
1    2
2    3
3    4
4    5
dtype: int64

### Creating a DataFrame

In [3]:
# Final-exam marks of ten students, kept as one list per column
name = [
    "Nasir", "Islam", "Sujan", "Sagor", "Jamal",
    "Rony", "Rana", "Shahin", "Jony", "Sumon",
]
math = [99, 58, 30, 40, 70, 77, 83, 68, 23, 0]
english = [78, 67, 34, 33, 32, 21, 45, 89, 95, 10]
physics = [20, 50, 55, 43, 78, 87, 46, 98, 69, 35]

# Assemble the lists into the result DataFrame; the keyword order fixes the column order
result = pd.DataFrame(dict(Name=name, Math=math, English=english, Physics=physics))

In [4]:
# Bare last expression: the notebook renders the whole DataFrame as a table
result

Unnamed: 0,Name,Math,English,Physics
0,Nasir,99,78,20
1,Islam,58,67,50
2,Sujan,30,34,55
3,Sagor,40,33,43
4,Jamal,70,32,78
5,Rony,77,21,87
6,Rana,83,45,46
7,Shahin,68,89,98
8,Jony,23,95,69
9,Sumon,0,10,35


### Head and Tail of a DataFrame

In [5]:
# Preview the first five rows (5 is head()'s default, spelled out here)
print(result.head(n=5))

    Name  Math  English  Physics
0  Nasir    99       78       20
1  Islam    58       67       50
2  Sujan    30       34       55
3  Sagor    40       33       43
4  Jamal    70       32       78


In [6]:
# Preview the last five rows (5 is tail()'s default, spelled out here)
print(result.tail(n=5))

     Name  Math  English  Physics
5    Rony    77       21       87
6    Rana    83       45       46
7  Shahin    68       89       98
8    Jony    23       95       69
9   Sumon     0       10       35


In [7]:
# head() takes the number of rows to show; here only the first two
print(result.head(n=2))

    Name  Math  English  Physics
0  Nasir    99       78       20
1  Islam    58       67       50


### Statistical Description of a DataFrame

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
result.describe()

Unnamed: 0,Math,English,Physics
count,10.0,10.0,10.0
mean,54.8,50.4,58.1
std,30.741937,29.72541,24.442449
min,0.0,10.0,20.0
25%,32.5,32.25,43.75
50%,63.0,39.5,52.5
75%,75.25,75.25,75.75
max,99.0,95.0,98.0


In [9]:
# Same summary, but print() emits the plain-text representation
print(result.describe())

            Math   English    Physics
count  10.000000  10.00000  10.000000
mean   54.800000  50.40000  58.100000
std    30.741937  29.72541  24.442449
min     0.000000  10.00000  20.000000
25%    32.500000  32.25000  43.750000
50%    63.000000  39.50000  52.500000
75%    75.250000  75.25000  75.750000
max    99.000000  95.00000  98.000000


In [10]:
# Transposed summary: one row per subject, one column per statistic
print(result.describe().transpose())

         count  mean        std   min    25%   50%    75%   max
Math      10.0  54.8  30.741937   0.0  32.50  63.0  75.25  99.0
English   10.0  50.4  29.725410  10.0  32.25  39.5  75.25  95.0
Physics   10.0  58.1  24.442449  20.0  43.75  52.5  75.75  98.0


### Accessing Single & Multiple Columns (Attributes)

In [11]:
# Attribute-style column access, then the first two values of the resulting Series
result.Name.head(2)

0    Nasir
1    Islam
Name: Name, dtype: object

In [12]:
# Bracket-style access to a single column by label; the result is a Series
result["Name"].head(2)

0    Nasir
1    Islam
Name: Name, dtype: object

In [13]:
# A list of labels inside the brackets selects several columns and yields a DataFrame
result[["Name", "Math"]].head(2)

Unnamed: 0,Name,Math
0,Nasir,99
1,Islam,58


In [14]:
# Last two rows of the two-column selection, printed as plain text
print(result[["Name", "Math"]].tail(2))

    Name  Math
8   Jony    23
9  Sumon     0


In [15]:
# The list of column labels can also be held in a variable and reused
selected_attr = ["Name", "Math", "English"]
result[selected_attr].head(n=3)

Unnamed: 0,Name,Math,English
0,Nasir,99,78
1,Islam,58,67
2,Sujan,30,34


### Some Basic Tasks with a DataFrame

In [16]:
# A single column can be removed with `del`, for example:
#   del result["Name"]
#
# `del` handles only one column per statement; drop() accepts a list of labels.
# `columns=` is the keyword form of passing the labels together with axis=1.
result.drop(columns=["Math", "Physics"], inplace=True)

In [17]:
# Final-exam marks of ten students (rebuilt because the previous cell dropped columns)
name = ["Nasir", "Islam", "Sujan", "Sagor", "Jamal", "Rony", "Rana", "Shahin", "Jony", "Sumon"]
math = [99, 58, 30, 40, 70, 77, 83, 68, 23, 0]
english = [78, 67, 34, 33, 32, 21, 45, 89, 95, 10]
physics = [20, 50, 55, 43, 78, 87, 46, 98, 69, 35]

# Build the result DataFrame row by row: one (name, math, english, physics) tuple per student
result = pd.DataFrame(
    list(zip(name, math, english, physics)),
    columns=["Name", "Math", "English", "Physics"],
)

In [18]:
# Relabel the three subject columns in place; labels absent from the mapping are kept
result.rename(
    columns={"Math": "Social Science", "English": "Chemistry", "Physics": "Biology"},
    inplace=True,
)

In [19]:
# Display result to confirm the renamed column labels
result

Unnamed: 0,Name,Social Science,Chemistry,Biology
0,Nasir,99,78,20
1,Islam,58,67,50
2,Sujan,30,34,55
3,Sagor,40,33,43
4,Jamal,70,32,78
5,Rony,77,21,87
6,Rana,83,45,46
7,Shahin,68,89,98
8,Jony,23,95,69
9,Sumon,0,10,35


In [20]:
# (number of rows, number of columns) of the DataFrame
result.shape

(10, 4)

In [21]:
# Column labels as a NumPy object array
result.columns.values

array(['Name', 'Social Science', 'Chemistry', 'Biology'], dtype=object)

In [22]:
# print() shows the same labels in NumPy's plain-text form (no "array(...)" wrapper)
print(result.columns.values)

['Name' 'Social Science' 'Chemistry' 'Biology']


### Conditional Search in a DataFrame

In [23]:
# Names of the students whose Chemistry mark is below 33
# (.loc takes the row mask and the column label in a single indexing step)
print(result.loc[result["Chemistry"] < 33, "Name"])

4    Jamal
5     Rony
9    Sumon
Name: Name, dtype: object


In [24]:
# Names of the students who scored more than 60 in every subject:
# compare the three subject columns at once, then require all of them per row
result.loc[(result[["Chemistry", "Biology", "Social Science"]] > 60).all(axis=1), "Name"]

7    Shahin
Name: Name, dtype: object

### Reading a `.CSV` file

In [25]:
# Load the student results from a CSV file (path is relative to the notebook)
student_dataset = pd.read_csv("../dataset/student_result.csv")
# Preview the first five rows
student_dataset.head(n=5)

Unnamed: 0,math,bangla,english,result
0,70,80,90,1
1,30,40,50,0
2,50,20,35,0
3,80,33,33,1
4,33,35,36,1


In [26]:
# Check the column dtypes, non-null counts and memory usage
student_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 4 columns):
math       15 non-null int64
bangla     15 non-null int64
english    15 non-null int64
result     15 non-null int64
dtypes: int64(4)
memory usage: 560.0 bytes


In [27]:
# Distribution of the `result` label.
# The output shows 8 rows with result 0 and 7 rows with result 1,
# i.e. 8 students failed the exam and 7 students passed.
student_dataset["result"].value_counts()

0    8
1    7
Name: result, dtype: int64

In [28]:
# Students with an A+ in math, i.e. a math mark of at least 80
student_dataset[student_dataset["math"].ge(80)]

Unnamed: 0,math,bangla,english,result
3,80,33,33,1


In [29]:
# Students who failed every subject: all three marks are below 33
student_dataset[(student_dataset[["math", "bangla", "english"]] < 33).all(axis=1)]

Unnamed: 0,math,bangla,english,result
12,0,0,0,0
13,10,10,10,0


In [30]:
# Pairwise correlation matrix of the columns (pandas' default method is Pearson)
student_dataset.corr()

Unnamed: 0,math,bangla,english,result
math,1.0,0.430168,0.526313,0.382474
bangla,0.430168,1.0,0.733799,0.204588
english,0.526313,0.733799,1.0,0.4842
result,0.382474,0.204588,0.4842,1.0


In [31]:
# Adding a new column.
# NOTE: assigning a Series aligns on the index, so any row of student_dataset whose
# index label has no counterpart in result["Name"] receives NaN in the new column.
student_dataset["Name"] = result["Name"]

In [32]:
# Display the DataFrame with the newly added Name column
student_dataset

Unnamed: 0,math,bangla,english,result,Name
0,70,80,90,1,Nasir
1,30,40,50,0,Islam
2,50,20,35,0,Sujan
3,80,33,33,1,Sagor
4,33,35,36,1,Jamal
5,32,80,35,0,Rony
6,40,50,21,0,Rana
7,33,35,35,1,Shahin
8,60,23,10,0,Jony
9,33,34,35,1,Sumon


In [33]:
# Cell-by-cell null check: True wherever a value is missing (isna is the alias of isnull)
student_dataset.isna()

Unnamed: 0,math,bangla,english,result,Name
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [34]:
# Condensed null check: one flag per column, True if that column has any missing value
print(student_dataset.isna().any())

math       False
bangla     False
english    False
result     False
Name        True
dtype: bool


In [35]:
# Single yes/no answer: is any per-column null count non-zero?
print(student_dataset.isna().sum().any())

True


In [36]:
# Impute null values.
# Only the Name column is filled: a text placeholder applied to the whole frame
# would also be written into any numeric column that happened to contain nulls.
student_dataset["Name"] = student_dataset["Name"].fillna("Anonymous")

In [37]:
# Display the DataFrame after imputing the missing values
student_dataset

Unnamed: 0,math,bangla,english,result,Name
0,70,80,90,1,Nasir
1,30,40,50,0,Islam
2,50,20,35,0,Sujan
3,80,33,33,1,Sagor
4,33,35,36,1,Jamal
5,32,80,35,0,Rony
6,40,50,21,0,Rana
7,33,35,35,1,Shahin
8,60,23,10,0,Jony
9,33,34,35,1,Sumon


In [38]:
# Remove the Name column again (in place), this time through drop() instead of `del`
student_dataset.drop(columns="Name", inplace=True)

In [39]:
# Confirm the column is gone by previewing the first two rows
student_dataset.head(n=2)

Unnamed: 0,math,bangla,english,result
0,70,80,90,1
1,30,40,50,0


In [40]:
# Dropping rows: trim student_dataset to as many rows as `result` has, so that the
# two frames line up for the column insert that follows.
# The rows are selected by position from len(result) onward rather than as "the
# last 5", which keeps the cell safe to re-run: once the frame is trimmed the
# slice is empty and nothing more is dropped.
student_dataset.drop(student_dataset.index[len(result):], inplace=True)

In [41]:
# Display the DataFrame after dropping rows
student_dataset

Unnamed: 0,math,bangla,english,result
0,70,80,90,1
1,30,40,50,0
2,50,20,35,0
3,80,33,33,1
4,33,35,36,1
5,32,80,35,0
6,40,50,21,0
7,33,35,35,1
8,60,23,10,0
9,33,34,35,1


In [42]:
# Insert a column at a specific position (loc=0 makes it the first column).
# The raw values are passed, so no index alignment takes place between the frames.
student_dataset.insert(loc=0, column="Name", value=result["Name"].values)

In [43]:
# Preview the first five rows to confirm Name is now the leading column
student_dataset.head(n=5)

Unnamed: 0,Name,math,bangla,english,result
0,Nasir,70,80,90,1
1,Islam,30,40,50,0
2,Sujan,50,20,35,0
3,Sagor,80,33,33,1
4,Jamal,33,35,36,1


In [44]:
# Changing the column names again, this time by assigning a full list of labels.
# Labels are applied by position, so the list must have one entry per existing column.
result.columns = ["Name", "math", "bangla", "english"]

In [45]:
# Display result with its new column labels
result

Unnamed: 0,Name,math,bangla,english
0,Nasir,99,78,20
1,Islam,58,67,50
2,Sujan,30,34,55
3,Sagor,40,33,43
4,Jamal,70,32,78
5,Rony,77,21,87
6,Rana,83,45,46
7,Shahin,68,89,98
8,Jony,23,95,69
9,Sumon,0,10,35


In [46]:
# Remove the `result` column in place so both frames share the same set of columns
student_dataset.drop(columns="result", inplace=True)

### Concatenating Two DataFrames

In [47]:
# Stack the two frames row-wise (axis=0 is concat's default, spelled out here);
# ignore_index=True discards both original indexes and renumbers the rows from 0
combine = pd.concat(objs=[student_dataset, result], axis=0, ignore_index=True)

In [48]:
# Display the concatenated DataFrame
combine

Unnamed: 0,Name,math,bangla,english
0,Nasir,70,80,90
1,Islam,30,40,50
2,Sujan,50,20,35
3,Sagor,80,33,33
4,Jamal,33,35,36
5,Rony,32,80,35
6,Rana,40,50,21
7,Shahin,33,35,35
8,Jony,60,23,10
9,Sumon,33,34,35
