In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns  # visualization tool
import statsmodels as sm

In [2]:
dt = pd.read_csv("train.csv")

In [3]:
dt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 1.Series

### Series- Creating a Series Object

In [4]:
serie = pd.Series([32,12,43,23], index = ["a", "b", "c", "d"])
serie

a    32
b    12
c    43
d    23
dtype: int64

In [5]:
obj = pd.Series(["32", "12", "54"], index=["George", "Mike", "Dustin"])
obj

George    32
Mike      12
Dustin    54
dtype: object

* dtype changes according to value types


### Series- Selecting

In [6]:
obj["George"]

'32'

In [7]:
index_a = ["George", "Mike"]

In [8]:
obj[index_a]

George    32
Mike      12
dtype: object

In [9]:
index_b = ["George", "Mike", "April"]

In [10]:
obj[index_b]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


George     32
Mike       12
April     NaN
dtype: object

Error, because there is not April string among obj indexes

In [11]:
obj

George    32
Mike      12
Dustin    54
dtype: object

### Series- Selecting by a filter

In [12]:
obj = obj.astype(int)
obj[obj < 30] # filter

Mike    12
dtype: int32

### Series- Mapping Value

In [13]:
"Mike" in obj # Like dicts with key

True

### Series- Dictionary to Series

In [14]:
dict1 = {"apple"  : 2,
         "banana" : 4,
         " peach" : 6}
dict1


{'apple': 2, 'banana': 4, ' peach': 6}

In [15]:
serie1 = pd.Series(dict1)

In [16]:
serie2 = pd.Series(dict1 , index=["banana", "apple", "avacado"])
serie2

banana     4.0
apple      2.0
avacado    NaN
dtype: float64

### Series- .isnull()

In [17]:
serie2.isnull()

banana     False
apple      False
avacado     True
dtype: bool

### Series- Sum

Pandas doesn't sum unmatched index'es and return NaN value

In [18]:
serie3 = serie + serie2
serie3

a         NaN
apple     NaN
avacado   NaN
b         NaN
banana    NaN
c         NaN
d         NaN
dtype: float64

In [19]:
serie3.sum()


0.0

In [20]:
serie3.isnull()

a          True
apple      True
avacado    True
b          True
banana     True
c          True
d          True
dtype: bool

### Series- Series to Numpy

In [21]:
serie1

apple     2
banana    4
 peach    6
dtype: int64

In [22]:
serie1.values

array([2, 4, 6], dtype=int64)

### Series- Series to Index

In [23]:
serie1.index

Index(['apple', 'banana', ' peach'], dtype='object')

### Series- Changement of index

In [24]:

serie1


apple     2
banana    4
 peach    6
dtype: int64

In [25]:
serie1.index = ["app", "ban", "pea"]
serie1

app    2
ban    4
pea    6
dtype: int64

In [26]:
dt2 = dt.drop(["Name", "Ticket", "Embarked"], axis="columns").head()
dt2


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin
0,1,0,3,male,22.0,1,0,7.25,
1,2,1,1,female,38.0,1,0,71.2833,C85
2,3,1,3,female,26.0,0,0,7.925,
3,4,1,1,female,35.0,1,0,53.1,C123
4,5,0,3,male,35.0,0,0,8.05,


In [27]:
dict = {"Pclass ": dt2.loc[:,"Pclass"],
        "Sex" : dt2.loc[:,"Sex"]}
a = pd.DataFrame(dict)
a

Unnamed: 0,Pclass,Sex
0,3,male
1,1,female
2,3,female
3,1,female
4,3,male


## 2.DataFrame (DF)

### DF - Construct a DataFrame - with dict

In [28]:
data = {"state" : ["Ohio", "Ohio", "Nevada", "Nevada"],
        "year"  : ["2000", "2001", "2002", "2003"],
        "popul"  : [1.5, 1.7, 3.6, 2.4]}
data

{'state': ['Ohio', 'Ohio', 'Nevada', 'Nevada'],
 'year': ['2000', '2001', '2002', '2003'],
 'popul': [1.5, 1.7, 3.6, 2.4]}

In [29]:
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,popul
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Nevada,2002,3.6
3,Nevada,2003,2.4


In [30]:
frame2 = pd.DataFrame(data, columns=["state", "year", "area"])
frame2

Unnamed: 0,state,year,area
0,Ohio,2000,
1,Ohio,2001,
2,Nevada,2002,
3,Nevada,2003,


DF - Construct a DataFrame - with nested dict¶

In [31]:
data = {"state" : {"2000" : "Ohio", "2001" : "Ohio", "2002" : "Nevada", "2003":"Nevada"},

        "popul" : {"2000" : 1.5, "2001" : 1.7}}
frame2 = pd.DataFrame(data)
frame2

Unnamed: 0,state,popul
2000,Ohio,1.5
2001,Ohio,1.7
2002,Nevada,
2003,Nevada,


### DF- Select Series from DF

In [32]:
frame["state"]

0      Ohio
1      Ohio
2    Nevada
3    Nevada
Name: state, dtype: object

In [33]:
frame.year

0    2000
1    2001
2    2002
3    2003
Name: year, dtype: object

### DF- Adding new column

#### Adding new column by list

In [34]:
frame["new_by_list"] = [3, 4, 5, 2]
frame

Unnamed: 0,state,year,popul,new_by_list
0,Ohio,2000,1.5,3
1,Ohio,2001,1.7,4
2,Nevada,2002,3.6,5
3,Nevada,2003,2.4,2


* If you assign a new column by a list, the number of item must be same with the DataFrame's index number


#### Adding new column by Series

In [35]:
obj = pd.Series([0,0,0,0], index=[0,1,2,4])

In [36]:
frame["new_by_Series"] = obj
frame

Unnamed: 0,state,year,popul,new_by_list,new_by_Series
0,Ohio,2000,1.5,3,0.0
1,Ohio,2001,1.7,4,0.0
2,Nevada,2002,3.6,5,0.0
3,Nevada,2003,2.4,2,


* If you assign a new column by a Series, the index values should be same with the DataFrame's index values
* Otherwise it will be a NaN value

### DF- del DF column

In [37]:
del frame["new_by_list"]

In [38]:
frame


Unnamed: 0,state,year,popul,new_by_Series
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,0.0
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


* We can del only one column at once
* If you want to del more columns at once you can use drop() method of DataFrame class

### DF- Transpose of DF

In [39]:
frame

Unnamed: 0,state,year,popul,new_by_Series
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,0.0
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


In [40]:
frame.T

Unnamed: 0,0,1,2,3
state,Ohio,Ohio,Nevada,Nevada
year,2000,2001,2002,2003
popul,1.5,1.7,3.6,2.4
new_by_Series,0,0,0,


### DF- Selecting a mini DF

In [41]:
frame

Unnamed: 0,state,year,popul,new_by_Series
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,0.0
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


In [42]:
data2 = {"year" : frame["year"][0:2],
         "popul": frame["popul"][frame["popul"]<3]}

In [43]:
frame3 = pd.DataFrame(data2)
frame3

Unnamed: 0,year,popul
0,2000.0,1.5
1,2001.0,1.7
3,,2.4


### DF- Index Object

In [44]:
inx_columns = frame3.columns
inx_columns


Index(['year', 'popul'], dtype='object')

* Columns and index'es are instances of pd.Index class
* Index objects are immutable like tuples

In [45]:
inx2 = pd.Index(['10','20', '30', '40'])


In [49]:
inx2[0]

'10'

In [51]:
#inx[0] = "30"

* It will throw an error because Index objects are immutable like tuples
* It means you can not make any changement over an index object

## 3.Essential Functionality (EF)

### EF - Reindexing

* Reindexing is not used to change index values
* it is used to reorder indexes

In [52]:
serie


a    32
b    12
c    43
d    23
dtype: int64

In [53]:
serie.reindex(["b", "c", "a", "d"])

b    12
c    43
a    32
d    23
dtype: int64

* If there is not any matching index it assign a NaN value

In [55]:
serie.reindex(["b", "c", "a", "p"]) 

b    12.0
c    43.0
a    32.0
p     NaN
dtype: float64

In [56]:
frame

Unnamed: 0,state,year,popul,new_by_Series
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,0.0
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


In [57]:
frame.reindex([2,3,1,0])

Unnamed: 0,state,year,popul,new_by_Series
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,
1,Ohio,2001,1.7,0.0
0,Ohio,2000,1.5,0.0


### EF - Reindexing - reorder Columns with axis = "columns"

In [58]:
frame.reindex(["popul", "state", "year"], axis = "columns")

Unnamed: 0,popul,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Nevada,2002
3,2.4,Nevada,2003


* It droped automaticaly unassigned columns values 

### EF - Dropping Entries from an Axis

* Drop method is using for deleting spesific entries
* Just as reindex method drop method has an parameter of axis info

In [59]:
frame

Unnamed: 0,state,year,popul,new_by_Series
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,0.0
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


In [62]:
frame.drop([0,1]) # It didnt change real dF because it is a copy of the frame

Unnamed: 0,state,year,popul,new_by_Series
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


In [63]:
frame.drop(["state","year"], axis="columns")

Unnamed: 0,popul,new_by_Series
0,1.5,0.0
1,1.7,0.0
2,3.6,0.0
3,2.4,


In [64]:
frame

Unnamed: 0,state,year,popul,new_by_Series
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,0.0
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


### EF - Slicing with Series, Selection and Filtering

#### EF - Slicing with Series

In [71]:
frame.index = inx2
frame

Unnamed: 0,state,year,popul,new_by_Series
10,Ohio,2000,1.5,0.0
20,Ohio,2001,1.7,0.0
30,Nevada,2002,3.6,0.0
40,Nevada,2003,2.4,


* End point is exclusive

In [72]:
frame["year"][0:3]

10    2000
20    2001
30    2002
Name: year, dtype: object

* At the Slicing with label end point is inclusive

In [73]:
frame["year"]["10":"30"]

10    2000
20    2001
30    2002
Name: year, dtype: object

* We can assign new values with slicing

In [80]:
frame["year"]["10":"30"] = ["3000", "3001", "3002"]
frame


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,state,year,popul,new_by_Series
10,Ohio,3000,1.5,0.0
20,Ohio,3001,1.7,0.0
30,Nevada,3002,3.6,0.0
40,Nevada,2003,2.4,


* It throwed a warning but did what we command and change values sliced
* But if you want to change more than one column it wont work with slice
* We should use .loc and .iloc special indexing operators

### EF - Selecting with loc and iloc

* with loc spesific location by index and column names
* with iloc spesific location by index and column index

In [82]:
frame

Unnamed: 0,state,year,popul,new_by_Series
10,Ohio,3000,1.5,0.0
20,Ohio,3001,1.7,0.0
30,Nevada,3002,3.6,0.0
40,Nevada,2003,2.4,


* loc

In [91]:
frame.loc["10" : "40", "state" : "popul" ]

Unnamed: 0,state,year,popul
10,Ohio,3000,1.5
20,Ohio,3001,1.7
30,Nevada,3002,3.6
40,Nevada,2003,2.4


In [92]:
frame.loc[["10", "40"], ["state", "popul"]]

Unnamed: 0,state,popul
10,Ohio,1.5
40,Nevada,2.4


* iloc

In [98]:
frame.iloc[[0,3], [0, 2]]

Unnamed: 0,state,popul
10,Ohio,1.5
40,Nevada,2.4


In [102]:
frame.iloc[0:3, 0:2]

Unnamed: 0,state,year
10,Ohio,3000
20,Ohio,3001
30,Nevada,3002


#### Example

In [103]:
frame

Unnamed: 0,state,year,popul,new_by_Series
10,Ohio,3000,1.5,0.0
20,Ohio,3001,1.7,0.0
30,Nevada,3002,3.6,0.0
40,Nevada,2003,2.4,


In [105]:
frame.iloc[:, 0:3][frame.popul <2.3]

Unnamed: 0,state,year,popul
10,Ohio,3000,1.5
20,Ohio,3001,1.7


### EF - Arithmetic

* Arithmetic operations are conducted with index based
* Like reindex operations if there is no any match index pandas assign NaN value

In [115]:
df1 = pd.DataFrame(np.arange(9.).reshape(3,3), index = list("abc"), columns = ["Car", "Bicycle", "Plain"])
df2 = pd.DataFrame(np.arange(16.).reshape(4,4), index = list("abcd"), columns = ["Car", "Bicycle", "Motor","Plain"])
                                                                               

In [117]:
df1

Unnamed: 0,Car,Bicycle,Plain
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0


In [118]:
df2

Unnamed: 0,Car,Bicycle,Motor,Plain
a,0.0,1.0,2.0,3.0
b,4.0,5.0,6.0,7.0
c,8.0,9.0,10.0,11.0
d,12.0,13.0,14.0,15.0


In [116]:
df1 + df2

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,,5.0
b,9.0,7.0,,12.0
c,16.0,14.0,,19.0
d,,,,


* DataFrame() class has some methods for arithmetic operations like x.add(y)
* add method has an fill_value parameter for NaN values in dataframes **NOT FOR NaN VALUES İN RESULT FRAME 

In [129]:
df3 = pd.DataFrame({"A" : [1,3,4],
                    "B" : [3,5,6]})
df4 = pd.DataFrame({"A" : [42,65,33],
                    "B" : [11,22,17]})

In [133]:
df3.iloc[[1,2], 1] = np.nan
df3

Unnamed: 0,A,B
0,1,3.0
1,3,
2,4,


In [134]:
df4

Unnamed: 0,A,B
0,42,11
1,65,22
2,33,17


In [139]:
df3.add(df4, fill_value = 0)


Unnamed: 0,A,B
0,45,20.0
1,74,22.0
2,45,17.0


### EF - Function Application and Mapping

In [141]:
df4

Unnamed: 0,A,B
0,44,17.0
1,71,22.0
2,41,17.0


In [140]:
f = lambda x : max(x) 
df4.apply(f)

A    71.0
B    22.0
dtype: float64

In [142]:
df4.apply(f, axis="columns")

0    44.0
1    71.0
2    41.0
dtype: float64

In [145]:
df5 = pd.DataFrame(np.random.randn(3,3), index = list("abc"), columns = ["Car", "Bicycle", "Plain"])
df5

Unnamed: 0,Car,Bicycle,Plain
a,-1.55362,0.689026,0.685424
b,-0.22744,0.587654,-0.575993
c,-1.680923,0.032477,-0.841525


In [147]:
format = lambda x: '%.2f' % x
df5.applymap(format)

Unnamed: 0,Car,Bicycle,Plain
a,-1.55,0.69,0.69
b,-0.23,0.59,-0.58
c,-1.68,0.03,-0.84


### EF - Sorting Series and DataFrames

#### EF - Sorting - .sort_index( , ascending = True)

In [155]:
serie1


app    2
ban    4
pea    6
dtype: int64

In [157]:
serie1.sort_index()

app    2
ban    4
pea    6
dtype: int64

In [158]:
serie1.sort_index(ascending=False)

pea    6
ban    4
app    2
dtype: int64

In [170]:
frame4 = frame
frame4.columns = list("bcad")
frame4.index = [30, 40, 50, 10]
frame4

Unnamed: 0,b,c,a,d
30,Ohio,3000,1.5,0.0
40,Ohio,3001,1.7,0.0
50,Nevada,3002,3.6,0.0
10,Nevada,2003,2.4,


In [171]:
frame4.sort_index()

Unnamed: 0,b,c,a,d
10,Nevada,2003,2.4,
30,Ohio,3000,1.5,0.0
40,Ohio,3001,1.7,0.0
50,Nevada,3002,3.6,0.0


In [172]:
frame4.sort_index(axis="columns")

Unnamed: 0,a,b,c,d
30,1.5,Ohio,3000,0.0
40,1.7,Ohio,3001,0.0
50,3.6,Nevada,3002,0.0
10,2.4,Nevada,2003,


#### EF - Sorting - .sort_values( , ascending = True)

In [175]:
frame4.sort_values(by="a", ascending=False)

Unnamed: 0,b,c,a,d
50,Nevada,3002,3.6,0.0
10,Nevada,2003,2.4,
40,Ohio,3001,1.7,0.0
30,Ohio,3000,1.5,0.0


In [181]:
frame4.loc[[10,30], "a"] = [3.6, 1.7]
frame4.sort_values(by="a", ascending=False)

Unnamed: 0,b,c,a,d
50,Nevada,3002,3.6,0.0
10,Nevada,2003,3.6,
30,Ohio,3000,1.7,0.0
40,Ohio,3001,1.7,0.0


In [183]:
frame4.sort_values(by=["a", "c"], ascending=True)

Unnamed: 0,b,c,a,d
30,Ohio,3000,1.7,0.0
40,Ohio,3001,1.7,0.0
10,Nevada,2003,3.6,
50,Nevada,3002,3.6,0.0


## 3.Essential Functionality (EF)