**Importing Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

**Loading Dataset**

In [2]:
df = pd.read_csv("domestic_visitors_2016.csv")
df1 = pd.read_csv("domestic_visitors_2017.csv")
df2 = pd.read_csv("domestic_visitors_2018.csv")
df3 = pd.read_csv("domestic_visitors_2019.csv")

**Creating Helping Functions**

In [3]:
def head(n):
    return n.head()

def shape(n):
    return n.shape

def null(n):
    return n.isnull().sum()

def inf(n):
    n.info()

def dupli(n):
    return n.duplicated().sum()

def drpna(n):
    n.dropna(inplace=True)

**Top 5 Rows of all Dataset**

In [4]:
print("2016 \n", head(df))
print("\n")
print("2017 \n", head(df1))
print("\n")
print("2018 \n", head(df2))
print("\n")
print("2019 \n", head(df3))

2016 
    district        date     month  year visitors
0  Adilabad  01-01-2016   January  2016   792136
1  Adilabad  01-02-2016  February  2016   937820
2  Adilabad  01-03-2016     March  2016   582946
3  Adilabad  01-04-2016     April  2016   341948
4  Adilabad  01-05-2016       May  2016   252887


2017 
    district        date     month  year visitors
0  Adilabad  01-01-2017   January  2017   318799
1  Adilabad  01-02-2017  February  2017    83316
2  Adilabad  01-03-2017     March  2017    27508
3  Adilabad  01-04-2017     April  2017    13946
4  Adilabad  01-05-2017       May  2017    11752


2018 
    district        date     month  year visitors
0  Adilabad  01-01-2018   January  2018   320356
1  Adilabad  01-02-2018  February  2018    36550
2  Adilabad  01-03-2018     March  2018    23011
3  Adilabad  01-04-2018     April  2018    14183
4  Adilabad  01-05-2018       May  2018     8197


2019 
    district        date     month  year visitors
0  Adilabad  01-01-2019   January  

**Printing Total Rows and Columns of All Dataset**

In [5]:
print("2016 \n")
print("Total Number of Rows    : ", df.shape[0])
print("Total Number of Columns : ", df.shape[1])
print("\n")

print("2017 \n")
print("Total Number of Rows    : ", df1.shape[0])
print("Total Number of Columns : ", df1.shape[1])
print("\n")

print("2018 \n")
print("Total Number of Rows    : ", df2.shape[0])
print("Total Number of Columns : ", df2.shape[1])
print("\n")

print("2019 \n")
print("Total Number of Rows    : ", df3.shape[0])
print("Total Number of Columns : ", df3.shape[1])

2016 

Total Number of Rows    :  372
Total Number of Columns :  5


2017 

Total Number of Rows    :  372
Total Number of Columns :  5


2018 

Total Number of Rows    :  372
Total Number of Columns :  5


2019 

Total Number of Rows    :  396
Total Number of Columns :  5


**Printing the Null Values of All Dataset**

In [6]:
print("2016 \n")
print(null(df))
print("\n")

print("2017 \n")
print(null(df1))
print("\n")

print("2018 \n")
print(null(df2))
print("\n")

print("2019 \n")
print(null(df3))

2016 

district    0
date        0
month       0
year        0
visitors    0
dtype: int64


2017 

district    0
date        0
month       0
year        0
visitors    0
dtype: int64


2018 

district     0
date         0
month        0
year         0
visitors    12
dtype: int64


2019 

district     0
date         0
month        0
year         0
visitors    18
dtype: int64


**Findings**

We have null values in 2018 and 2019 dataset in visitors column

I decided to remove it instead of impute it because its a visitors column which means that day have holiday and visitors won't visit the place.

In [7]:
df2.dropna(inplace = True)
df3.dropna(inplace = True)

In [8]:
print("2018 \n")
print(null(df2))
print("\n")

print("2019 \n")
print(null(df3))

2018 

district    0
date        0
month       0
year        0
visitors    0
dtype: int64


2019 

district    0
date        0
month       0
year        0
visitors    0
dtype: int64


**Printing the Duplicates Values**

In [9]:
print("2016 \n", dupli(df))
print("2017 \n", dupli(df1))
print("2018 \n", dupli(df2))
print("2019 \n", dupli(df3))

2016 
 0
2017 
 0
2018 
 0
2019 
 0


**Removing the White Space**

In [10]:
df.visitors = df.visitors.str.replace(' ','')
df1.visitors = df1.visitors.str.replace(' ','')
df2.visitors = df2.visitors.str.replace(' ','')
df3.visitors = df3.visitors.str.replace(' ','')

**Converting the dtypes of columns**

In [11]:
df['date'] = pd.to_datetime(df['date'])
df1['date'] = pd.to_datetime(df1['date'])
df2['date'] = pd.to_datetime(df2['date'])
df3['date'] = pd.to_datetime(df3['date'])


df['year'] = df['year'].astype('int32')
df1['year'] = df1['year'].astype('int32')
df2['year'] = df2['year'].astype('int32')
df3['year'] = df3['year'].astype('int32')

In [12]:
df['visitors'] = pd.to_numeric(df['visitors'], errors='coerce')
df1['visitors'] = pd.to_numeric(df1['visitors'], errors='coerce')
df2['visitors'] = pd.to_numeric(df2['visitors'], errors='coerce')
df3['visitors'] = pd.to_numeric(df3['visitors'], errors='coerce')

In [13]:
# replace NaN and inf values with zero

df['visitors'] = df['visitors'].replace([np.nan, np.inf], 0)
df1['visitors'] = df1['visitors'].replace([np.nan, np.inf], 0)
df2['visitors'] = df2['visitors'].replace([np.nan, np.inf], 0)
df3['visitors'] = df3['visitors'].replace([np.nan, np.inf], 0)

In [14]:
# cast the column to int32

df['visitors'] = df['visitors'].astype('int32')
df1['visitors'] = df1['visitors'].astype('int32')
df2['visitors'] = df2['visitors'].astype('int32')
df3['visitors'] = df3['visitors'].astype('int32')

**Printing the Information of the Dataset**

In [15]:
print("2016 \n")
print(inf(df))
print("\n")

print("2017 \n")
print(inf(df1))
print("\n")

print("2018 \n")
print(inf(df2))
print("\n")

print("2019 \n")
print(inf(df3))

2016 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372 entries, 0 to 371
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   district  372 non-null    object        
 1   date      372 non-null    datetime64[ns]
 2   month     372 non-null    object        
 3   year      372 non-null    int32         
 4   visitors  372 non-null    int32         
dtypes: datetime64[ns](1), int32(2), object(2)
memory usage: 11.8+ KB
None


2017 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372 entries, 0 to 371
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   district  372 non-null    object        
 1   date      372 non-null    datetime64[ns]
 2   month     372 non-null    object        
 3   year      372 non-null    int32         
 4   visitors  372 non-null    int32         
dtypes: datetime64[ns](1), int32(2), object(2)
memory usa

**Merging all Dataset into one Dataset named Domestic_Visitors**

In [16]:
Domestic_Visitors = pd.concat([df,df1, df2,df3], axis=0)

In [17]:
# Printing the Top 5 Rows of Dataset

head(Domestic_Visitors)

Unnamed: 0,district,date,month,year,visitors
0,Adilabad,2016-01-01,January,2016,792136
1,Adilabad,2016-01-02,February,2016,937820
2,Adilabad,2016-01-03,March,2016,582946
3,Adilabad,2016-01-04,April,2016,341948
4,Adilabad,2016-01-05,May,2016,252887


In [18]:
# Printing the Last 5 Rows of Dataset

Domestic_Visitors.tail()

Unnamed: 0,district,date,month,year,visitors
391,Yadadri Bhongir,2019-01-08,August,2019,389010
392,Yadadri Bhongir,2019-01-09,September,2019,366862
393,Yadadri Bhongir,2019-01-10,October,2019,381860
394,Yadadri Bhongir,2019-01-11,November,2019,365990
395,Yadadri Bhongir,2019-01-12,December,2019,477635


In [19]:
# Printing the Random 10 Rows of Datset

Domestic_Visitors.sample(10)

Unnamed: 0,district,date,month,year,visitors
169,Medak,2016-01-02,February,2016,215000
228,Nizamabad,2017-01-01,January,2017,613
109,Khammam,2017-01-02,February,2017,115470
332,Wanaparthy,2017-01-09,September,2017,35860
30,Hyderabad,2019-01-07,July,2019,1094861
166,Mancherial,2019-01-11,November,2019,29302
83,Jogulamba Gadwal,2017-01-12,December,2017,214887
141,Mahabubabad,2017-01-10,October,2017,13750
326,Wanaparthy,2018-01-03,March,2018,20050
39,Jagtial,2019-01-04,April,2019,447226


In [20]:
# Printing Total Numbers of Rows and Columns


print("Total Number of Rows    : " , Domestic_Visitors.shape[0])
print("Total Number of Columns : " , Domestic_Visitors.shape[1])

Total Number of Rows    :  1482
Total Number of Columns :  5


In [21]:
# Printing the Null Values of the Dataset

null(Domestic_Visitors)

district    0
date        0
month       0
year        0
visitors    0
dtype: int64

In [22]:
# Printing the Duplicated Values

dupli(Domestic_Visitors)

0

In [23]:
# Saving the Domestic_Visitors.csv file

Domestic_Visitors.to_csv("domestic_visitors.csv",index=False)

In [24]:
data = pd.read_csv("domestic_visitors.csv")
data.head()

Unnamed: 0,district,date,month,year,visitors
0,Adilabad,2016-01-01,January,2016,792136
1,Adilabad,2016-01-02,February,2016,937820
2,Adilabad,2016-01-03,March,2016,582946
3,Adilabad,2016-01-04,April,2016,341948
4,Adilabad,2016-01-05,May,2016,252887


In [25]:
data.shape

(1482, 5)

In [26]:
data.isnull().sum()

district    0
date        0
month       0
year        0
visitors    0
dtype: int64