# Pandas
- Pandas is an open-source Python library.
- It provides high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
# DataFrame 
- A DataFrame is a two-dimensional, size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns).
- In other words, the data in a DataFrame is aligned in a tabular fashion in rows and columns. A pandas DataFrame consists of three principal components: the data, the rows, and the columns.

In [4]:
# importing pandas and numpy
import pandas as pd
import numpy as np

In [5]:
# Build a 5x4 frame holding the integers 0..19, with explicit row and column labels.
df = pd.DataFrame(
    np.arange(20).reshape(5, 4),
    index=[f"Row{i}" for i in range(1, 6)],
    columns=["Column1", "column2", "column3", "column4"],
)

In [6]:
df.head()


Unnamed: 0,Column1,column2,column3,column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [9]:
# Saving it as a CSV file (use df.to_excel for an actual Excel sheet)
df.to_csv("test.csv")


In [10]:
#accessing the elements
# 1. .loc: label-based selection (row/column names)  2. .iloc: integer-position-based selection (row/column numbers)
df.loc["Row1"]

Column1    0
column2    1
column3    2
column4    3
Name: Row1, dtype: int32

In [11]:
# a Series can be either a single row or a single column of the DataFrame
type(df.loc["Row1"])


pandas.core.series.Series

In [15]:
df.iloc[0:2,0:3]

Unnamed: 0,Column1,column2,column3
Row1,0,1,2
Row2,4,5,6


In [17]:
type(df.iloc[0:2,0:3])

pandas.core.frame.DataFrame

### A Series holds a single one-dimensional list of values with an index, whereas a DataFrame is made up of one or more Series; in other words, a DataFrame is a collection of Series that can be used to analyse the data

In [20]:
df.iloc[0:2,0]

Row1    0
Row2    4
Name: Column1, dtype: int32

In [21]:
type(df.iloc[0:2,0])

pandas.core.series.Series

In [12]:
df.iloc[:,:]

Unnamed: 0,Column1,column2,column3,column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [23]:

df.iloc[:,1:]

Unnamed: 0,column2,column3,column4
Row1,1,2,3
Row2,5,6,7
Row3,9,10,11
Row4,13,14,15
Row5,17,18,19


In [40]:
df[['Column1','column2']]

Unnamed: 0,Column1,column2
Row1,0,1
Row2,4,5
Row3,8,9
Row4,12,13
Row5,16,17


In [24]:
#converting dataframe into arrays
df.iloc[:,1:].values

array([[ 1,  2,  3],
       [ 5,  6,  7],
       [ 9, 10, 11],
       [13, 14, 15],
       [17, 18, 19]])

In [25]:
df.iloc[:,1:].values.shape

(5, 3)

In [30]:
# counting the null values in each column
df.isnull().sum()

Column1    0
column2    0
column3    0
column4    0
dtype: int64

In [33]:
df["Column1"].value_counts()

12    1
4     1
16    1
8     1
0     1
Name: Column1, dtype: int64

In [34]:
# unique is a method: it must be called with parentheses, otherwise the cell only
# displays the bound method object instead of the distinct values of the column
df["Column1"].unique()

array([ 0,  4,  8, 12, 16])

In [42]:
# loading a CSV file into a DataFrame
df=pd.read_csv("Data.csv")

In [43]:
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income,Spending Score
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
CustomerID         200 non-null int64
Gender             200 non-null object
Age                200 non-null int64
Annual Income      200 non-null int64
Spending Score     200 non-null int64
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [45]:
df.describe()

Unnamed: 0,CustomerID,Age,Annual Income,Spending Score
count,200.0,200.0,200.0,200.0
mean,100.5,38.85,60.56,50.2
std,57.879185,13.969007,26.264721,25.823522
min,1.0,18.0,15.0,1.0
25%,50.75,28.75,41.5,34.75
50%,100.5,36.0,61.5,50.0
75%,150.25,49.0,78.0,73.0
max,200.0,70.0,137.0,99.0


In [47]:
# counting how many times each distinct Age value occurs
df["Age"].value_counts()

32    11
35     9
19     8
31     8
30     7
49     7
27     6
47     6
40     6
23     6
36     6
38     6
50     5
48     5
29     5
21     5
20     5
34     5
18     4
28     4
59     4
24     4
67     4
54     4
39     3
25     3
33     3
22     3
37     3
43     3
68     3
45     3
46     3
60     3
41     2
57     2
66     2
65     2
63     2
58     2
26     2
70     2
42     2
53     2
52     2
51     2
44     2
55     1
64     1
69     1
56     1
Name: Age, dtype: int64

In [49]:
df[df["Age"]>60]

Unnamed: 0,CustomerID,Gender,Age,Annual Income,Spending Score
8,9,Male,64,19,3
10,11,Male,67,19,14
40,41,Female,65,38,35
57,58,Male,69,44,46
60,61,Male,70,46,56
62,63,Female,67,47,52
64,65,Male,63,48,51
67,68,Female,68,48,48
70,71,Male,70,49,55
82,83,Male,67,54,41


In [59]:
from io import StringIO,BytesIO

data=('cols1,cols2,cols3\n'
      'x,y,1\n'
      'a,b,2\n'
      'c,d,3')



In [60]:
type(data)

str

In [61]:
pd.read_csv(StringIO(data))

Unnamed: 0,cols1,cols2,cols3
0,x,y,1
1,a,b,2
2,c,d,3


In [63]:
pd.read_csv(StringIO(data),usecols=["cols1","cols3"])

Unnamed: 0,cols1,cols3
0,x,1
1,a,2
2,c,3


In [76]:
data=('a,b,c,d\n'
      '1,2,3,4\n'
      '4,6,12,7\n'
      '4,5,8,5')


In [82]:
# we can set our own datatype
df=pd.read_csv(StringIO(data),dtype=int)

In [83]:
df["a"]

0    1
1    4
2    4
Name: a, dtype: int32

In [86]:
# Per-column dtypes: b -> int, c -> float, d -> "Int64" (pandas' nullable integer type).
# Column a is not listed in the mapping, so its dtype is left to the parser's inference.
df=pd.read_csv(StringIO(data),dtype={'b':int,'c':float,'d':"Int64"})

In [87]:
df

Unnamed: 0,a,b,c,d
0,1,2,3.0,4
1,4,6,12.0,7
2,4,5,8.0,5


In [88]:
df["a"][1]

4

In [89]:
df.dtypes

a      int64
b      int32
c    float64
d      Int64
dtype: object

In [101]:
data=('index,a,b,c\n'
      '4,apple,bat,5.7\n'
      '8,orange,cow,10\n')

In [102]:
data

'index,a,b,c\n4,apple,bat,5.7\n8,orange,cow,10\n'

In [103]:
df=pd.read_csv(StringIO(data))

In [104]:
df

Unnamed: 0,index,a,b,c
0,4,apple,bat,5.7
1,8,orange,cow,10.0


In [106]:
# index_col=0 uses the first CSV column (here named "index") as the row labels
df=pd.read_csv(StringIO(data),index_col=0)

In [107]:
df

Unnamed: 0_level_0,a,b,c
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,apple,bat,5.7
8,orange,cow,10.0


In [108]:
data=('a,b,c\n'
      '4,apple,bat\n'
      '8,orange,cow\n')

In [109]:
df=pd.read_csv(StringIO(data))

In [110]:
df

Unnamed: 0,a,b,c
0,4,apple,bat
1,8,orange,cow


In [121]:
# NOTE(review): the data row begins with a comma, so it carries three fields under a
# two-column header, and only the quote before Bob is escaped. The recorded output
# reflects that (empty index label, stray quotes) — confirm this is the intended sample.
data='a,b\n,"hello,\\"Bob" \\,nice to see u",5'

In [122]:
# escapechar='\\' tells the parser to treat a backslash as an escape character, so the
# character following it is read literally rather than as a quote or delimiter
df=pd.read_csv(StringIO(data),escapechar='\\')

In [123]:
df

Unnamed: 0,a,b
,"hello,""Bob ,nice to see u""",5
