In [1]:
# pandas in Python
# Project home page and documentation: https://pandas.pydata.org/

# Install pandas with the %pip magic; restart the kernel afterwards if the
# installer reports that packages were updated.
%pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Bring pandas into scope under its conventional alias.
import pandas as pd

# --- Series ---
# A Series is a one-dimensional labeled array that can hold any data type
# (integers, strings, floating point numbers, Python objects, ...).
# Its axis labels are collectively called the index.

# Build a Series from a plain list. No index is given here, so the values
# are labeled with the default positions 0, 1, 2, ...
data = [
    "JAVA",
    "Python",
    "C++",
    "JavaScript",
]
s = pd.Series(data=data)
print(s)

0          JAVA
1        Python
2           C++
3    JavaScript
dtype: object


In [3]:
# Build a Series from a dictionary: the keys become the index labels and
# the values become the Series values.
data = dict(name="John", age=30, city="New York")

s = pd.Series(data=data)
print(s)

name        John
age           30
city    New York
dtype: object


In [5]:
# Give the Series custom index labels instead of the default 0..n-1
# positions by passing `index=` when constructing it.
data = ["JAVA", "Python", "C++", "JavaScript"]
index_labels = ["a", "b", "c", "d"]

s = pd.Series(data, index=index_labels)
print(s)

a          JAVA
b        Python
c           C++
d    JavaScript
dtype: object


In [7]:
# --- DataFrame ---
# A DataFrame is a two-dimensional labeled data structure whose columns may
# have different types. It resembles a spreadsheet, a SQL table, or a
# dictionary of Series objects, and it has both an index (rows) and columns.

# Example: each dictionary key becomes a column name and each list holds
# that column's values, one entry per row.
data = dict(
    name=["John", "Anna", "Peter"],
    age=[28, 24, 35],
    city=["New York", "Paris", "Berlin"],
)

df = pd.DataFrame(data=data)
print(df)

    name  age      city
0   John   28  New York
1   Anna   24     Paris
2  Peter   35    Berlin


In [8]:
# Build the same DataFrame, this time with explicit row labels supplied
# through `index=` instead of the default 0..n-1 positions.
data = dict(
    name=["John", "Anna", "Peter"],
    age=[28, 24, 35],
    city=["New York", "Paris", "Berlin"],
)
row_labels = ["a", "b", "c"]

df = pd.DataFrame(data, index=row_labels)
print(df)

    name  age      city
a   John   28  New York
b   Anna   24     Paris
c  Peter   35    Berlin


In [9]:
# Load the employee dataset from a CSV file into a DataFrame.
# The path is relative to the notebook's working directory.
employee_csv = "DataSet/Employee.csv"
data = pd.read_csv(employee_csv)
print(data)

      Education  JoiningYear       City  PaymentTier  Age  Gender EverBenched  \
0     Bachelors         2017  Bangalore            3   34    Male          No   
1     Bachelors         2013       Pune            1   28  Female          No   
2     Bachelors         2014  New Delhi            3   38  Female          No   
3       Masters         2016  Bangalore            3   27    Male          No   
4       Masters         2017       Pune            3   24    Male         Yes   
...         ...          ...        ...          ...  ...     ...         ...   
4648  Bachelors         2013  Bangalore            3   26  Female          No   
4649    Masters         2013       Pune            2   37    Male          No   
4650    Masters         2018  New Delhi            3   27    Male          No   
4651  Bachelors         2012  Bangalore            3   30    Male         Yes   
4652  Bachelors         2015  Bangalore            3   33    Male         Yes   

      ExperienceInCurrentDo

In [10]:
# Preview the first 5 rows of the DataFrame (5 is also head()'s default).
print(data.head(n=5))

   Education  JoiningYear       City  PaymentTier  Age  Gender EverBenched  \
0  Bachelors         2017  Bangalore            3   34    Male          No   
1  Bachelors         2013       Pune            1   28  Female          No   
2  Bachelors         2014  New Delhi            3   38  Female          No   
3    Masters         2016  Bangalore            3   27    Male          No   
4    Masters         2017       Pune            3   24    Male         Yes   

   ExperienceInCurrentDomain  LeaveOrNot  
0                          0           0  
1                          3           1  
2                          2           0  
3                          5           1  
4                          2           1  


In [11]:
# The shape of a DataFrame is a (number_of_rows, number_of_columns) tuple.
n_rows, n_cols = data.shape
print((n_rows, n_cols))

(4653, 9)


In [12]:
# List every column name; `.columns` returns them as a pandas Index.
column_labels = data.columns
print(column_labels)

Index(['Education', 'JoiningYear', 'City', 'PaymentTier', 'Age', 'Gender',
       'EverBenched', 'ExperienceInCurrentDomain', 'LeaveOrNot'],
      dtype='object')


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the DataFrame.
summary_stats = data.describe()
print(summary_stats)

       JoiningYear  PaymentTier          Age  ExperienceInCurrentDomain  \
count  4653.000000  4653.000000  4653.000000                4653.000000   
mean   2015.062970     2.698259    29.393295                   2.905652   
std       1.863377     0.561435     4.826087                   1.558240   
min    2012.000000     1.000000    22.000000                   0.000000   
25%    2013.000000     3.000000    26.000000                   2.000000   
50%    2015.000000     3.000000    28.000000                   3.000000   
75%    2017.000000     3.000000    32.000000                   4.000000   
max    2018.000000     3.000000    41.000000                   7.000000   

        LeaveOrNot  
count  4653.000000  
mean      0.343864  
std       0.475047  
min       0.000000  
25%       0.000000  
50%       0.000000  
75%       1.000000  
max       1.000000  


In [15]:
# Show the data type pandas assigned to each column.
column_types = data.dtypes
print(column_types)

Education                    object
JoiningYear                   int64
City                         object
PaymentTier                   int64
Age                           int64
Gender                       object
EverBenched                  object
ExperienceInCurrentDomain     int64
LeaveOrNot                    int64
dtype: object


In [19]:
# Select specific columns by label; each `data["..."]` returns a Series.
# print() joins its arguments with a single space by default, which put the
# first row of the second Series on the same line as the first Series'
# "Name: ..., dtype: ..." footer. Separate them with a blank line instead.
print(data["Education"], data["Gender"], sep="\n\n")

0       Bachelors
1       Bachelors
2       Bachelors
3         Masters
4         Masters
          ...    
4648    Bachelors
4649      Masters
4650      Masters
4651    Bachelors
4652    Bachelors
Name: Education, Length: 4653, dtype: object 0         Male
1       Female
2       Female
3         Male
4         Male
         ...  
4648    Female
4649      Male
4650      Male
4651      Male
4652      Male
Name: Gender, Length: 4653, dtype: object


In [20]:
# Fetch a single row by integer position with .iloc (positions start at 0,
# so position 5 is the sixth row). The row comes back as a Series.
row_position = 5
print(data.iloc[row_position])

Education                    Bachelors
JoiningYear                       2016
City                         Bangalore
PaymentTier                          3
Age                                 22
Gender                            Male
EverBenched                         No
ExperienceInCurrentDomain            0
LeaveOrNot                           0
Name: 5, dtype: object


In [21]:
# Fetch a single cell by integer position: .iloc[row, column].
# Row position 5 and column position 2 (the "City" column).
row_position, column_position = 5, 2
print(data.iloc[row_position, column_position])

Bangalore


In [22]:
# Keep only the employees whose age is greater than 30, using a boolean
# mask to pick the matching rows.
is_over_30 = data["Age"] > 30
new_data = data.loc[is_over_30]

# Save the filtered rows to a new CSV file; index=False leaves the row
# index out of the written file.
new_data.to_csv("DataSet/new_employee.csv", index=False)

In [32]:
# Convert the integer JoiningYear column to datetime values. With
# format="%Y" each year is parsed as January 1st of that year.
joining_dates = pd.to_datetime(data["JoiningYear"], format="%Y")
data["JoiningYear"] = joining_dates
print(data)

      Education JoiningYear       City  PaymentTier  Age  Gender EverBenched  \
0     Bachelors  2017-01-01  Bangalore            3   34    Male          No   
1     Bachelors  2013-01-01       Pune            1   28  Female          No   
2     Bachelors  2014-01-01  New Delhi            3   38  Female          No   
3       Masters  2016-01-01  Bangalore            3   27    Male          No   
4       Masters  2017-01-01       Pune            3   24    Male         Yes   
...         ...         ...        ...          ...  ...     ...         ...   
4648  Bachelors  2013-01-01  Bangalore            3   26  Female          No   
4649    Masters  2013-01-01       Pune            2   37    Male          No   
4650    Masters  2018-01-01  New Delhi            3   27    Male          No   
4651  Bachelors  2012-01-01  Bangalore            3   30    Male         Yes   
4652  Bachelors  2015-01-01  Bangalore            3   33    Male         Yes   

      ExperienceInCurrentDomain  LeaveO

In [34]:
# Re-check the column data types; JoiningYear should now be a datetime
# column rather than an integer one.
column_types = data.dtypes
print(column_types)

Education                            object
JoiningYear                  datetime64[ns]
City                                 object
PaymentTier                           int64
Age                                   int64
Gender                               object
EverBenched                          object
ExperienceInCurrentDomain             int64
LeaveOrNot                            int64
dtype: object


In [None]:
# 