In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns  # visualization tool
import statsmodels as sm

In [2]:
dt = pd.read_csv("train.csv")

In [3]:
dt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 1.Series

### Series- Creating a Series Object

In [4]:
serie = pd.Series([32,12,43,23], index = ["a", "b", "c", "d"])
serie

a    32
b    12
c    43
d    23
dtype: int64

In [5]:
obj = pd.Series(["32", "12", "54"], index=["George", "Mike", "Dustin"])
obj

George    32
Mike      12
Dustin    54
dtype: object

* dtype changes according to value types


### Series- Selecting

In [6]:
obj["George"]

'32'

In [7]:
index_a = ["George", "Mike"]

In [8]:
obj[index_a]

George    32
Mike      12
dtype: object

In [9]:
index_b = ["George", "Mike", "April"]

In [10]:
obj[index_b]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


George     32
Mike       12
April     NaN
dtype: object

Error, because there is not April string among obj indexes

In [11]:
obj

George    32
Mike      12
Dustin    54
dtype: object

### Series- Selecting by a filter

In [12]:
obj = obj.astype(int)
obj[obj < 30] # filter

Mike    12
dtype: int32

### Series- Mapping Value

In [13]:
"Mike" in obj # Like dicts with key

True

### Series- Dictionary to Series

In [14]:
dict1 = {"apple"  : 2,
         "banana" : 4,
         " peach" : 6}
dict1


{'apple': 2, 'banana': 4, ' peach': 6}

In [15]:
serie1 = pd.Series(dict1)

In [16]:
serie2 = pd.Series(dict1 , index=["banana", "apple", "avacado"])
serie2

banana     4.0
apple      2.0
avacado    NaN
dtype: float64

### Series- .isnull()

In [17]:
serie2.isnull()

banana     False
apple      False
avacado     True
dtype: bool

### Series- Sum

Pandas doesn't sum unmatched index'es and return NaN value

In [18]:
serie3 = serie + serie2
serie3

a         NaN
apple     NaN
avacado   NaN
b         NaN
banana    NaN
c         NaN
d         NaN
dtype: float64

In [19]:
serie3.sum()


0.0

In [20]:
serie3.isnull()

a          True
apple      True
avacado    True
b          True
banana     True
c          True
d          True
dtype: bool

### Series- Unique()

In [21]:
serie.unique()

array([32, 12, 43, 23], dtype=int64)

### Series- Series to Numpy

In [22]:
serie1

apple     2
banana    4
 peach    6
dtype: int64

In [23]:
serie1.values

array([2, 4, 6], dtype=int64)

### Series- Series to Index

In [24]:
serie1.index

Index(['apple', 'banana', ' peach'], dtype='object')

### Series- Changement of index

In [25]:

serie1


apple     2
banana    4
 peach    6
dtype: int64

In [26]:
serie1.index = ["app", "ban", "pea"]
serie1

app    2
ban    4
pea    6
dtype: int64

## 2.DataFrame (DF)

### DF - Construct a DataFrame - with dict

In [27]:
data = {"state" : ["Ohio", "Ohio", "Nevada", "Nevada"],
        "year"  : ["2000", "2001", "2002", "2003"],
        "popul"  : [1.5, 1.7, 3.6, 2.4]}
data

{'state': ['Ohio', 'Ohio', 'Nevada', 'Nevada'],
 'year': ['2000', '2001', '2002', '2003'],
 'popul': [1.5, 1.7, 3.6, 2.4]}

In [28]:
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,popul
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Nevada,2002,3.6
3,Nevada,2003,2.4


In [29]:
frame2 = pd.DataFrame(data, columns=["state", "year", "area"])
frame2

Unnamed: 0,state,year,area
0,Ohio,2000,
1,Ohio,2001,
2,Nevada,2002,
3,Nevada,2003,


DF - Construct a DataFrame - with nested dict¶

In [30]:
data = {"state" : {"2000" : "Ohio", "2001" : "Ohio", "2002" : "Nevada", "2003":"Nevada"},

        "popul" : {"2000" : 1.5, "2001" : 1.7}}
frame2 = pd.DataFrame(data)
frame2

Unnamed: 0,state,popul
2000,Ohio,1.5
2001,Ohio,1.7
2002,Nevada,
2003,Nevada,


### DF- Select Series from DF

In [31]:
frame["state"]

0      Ohio
1      Ohio
2    Nevada
3    Nevada
Name: state, dtype: object

In [32]:
frame.year

0    2000
1    2001
2    2002
3    2003
Name: year, dtype: object

### DF- Adding new column

#### Adding new column by list

In [33]:
frame["new_by_list"] = [3, 4, 5, 2]
frame

Unnamed: 0,state,year,popul,new_by_list
0,Ohio,2000,1.5,3
1,Ohio,2001,1.7,4
2,Nevada,2002,3.6,5
3,Nevada,2003,2.4,2


* If you assign a new column by a list, the number of item must be same with the DataFrame's index number


#### Adding new column by Series

In [34]:
obj = pd.Series([0,0,0,0], index=[0,1,2,4])

In [35]:
frame["new_by_Series"] = obj
frame

Unnamed: 0,state,year,popul,new_by_list,new_by_Series
0,Ohio,2000,1.5,3,0.0
1,Ohio,2001,1.7,4,0.0
2,Nevada,2002,3.6,5,0.0
3,Nevada,2003,2.4,2,


* If you assign a new column by a Series, the index values should be same with the DataFrame's index values
* Otherwise it will be a NaN value

### DF- del DF column

In [36]:
del frame["new_by_list"]

In [37]:
frame


Unnamed: 0,state,year,popul,new_by_Series
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,0.0
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


* We can del only one column at once
* If you want to del more columns at once you can use drop() method of DataFrame class

### DF- del DF column or index by drop() method

In [38]:
dt4 = dt.head()
dt4

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [39]:
dt4.drop(["Name", "Ticket", "Embarked"], axis="columns")


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin
0,1,0,3,male,22.0,1,0,7.25,
1,2,1,1,female,38.0,1,0,71.2833,C85
2,3,1,3,female,26.0,0,0,7.925,
3,4,1,1,female,35.0,1,0,53.1,C123
4,5,0,3,male,35.0,0,0,8.05,


* As before drop() method has a default axis value as 0 (rows) 

In [40]:
dt4.drop([2, 4])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


### DF- Transpose of DF

In [41]:
frame

Unnamed: 0,state,year,popul,new_by_Series
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,0.0
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


In [42]:
frame.T

Unnamed: 0,0,1,2,3
state,Ohio,Ohio,Nevada,Nevada
year,2000,2001,2002,2003
popul,1.5,1.7,3.6,2.4
new_by_Series,0,0,0,


### DF- Selecting a mini DF

In [43]:
frame

Unnamed: 0,state,year,popul,new_by_Series
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,0.0
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


In [44]:
data2 = {"year" : frame["year"][0:2],
         "popul": frame["popul"][frame["popul"]<3]}

In [45]:
frame3 = pd.DataFrame(data2)
frame3

Unnamed: 0,year,popul
0,2000.0,1.5
1,2001.0,1.7
3,,2.4


### DF- Index Object

In [46]:
inx_columns = frame3.columns
inx_columns


Index(['year', 'popul'], dtype='object')

* Columns and index'es are instances of pd.Index class
* Index objects are immutable like tuples

In [47]:
inx2 = pd.Index(['10','20', '30', '40'])


In [48]:
inx2[0]

'10'

In [49]:
#inx[0] = "30"

* It will throw an error because Index objects are immutable like tuples
* It means you can not make any changement over an index object

## 3.Essential Functionality (EF)

### EF - Setindex() method

* To choose an existing column as index

In [50]:
efs = dt4.copy()
efs


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [51]:
efs.set_index("PassengerId")

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### EF - Reindex() method

* Reindexing is not used to change index values
* it is used to reorder indexes

In [52]:
serie


a    32
b    12
c    43
d    23
dtype: int64

In [53]:
serie.reindex(["b", "c", "a", "d"])

b    12
c    43
a    32
d    23
dtype: int64

* If there is not any matching index it assign a NaN value

In [54]:
serie.reindex(["b", "c", "a", "p"]) 

b    12.0
c    43.0
a    32.0
p     NaN
dtype: float64

In [55]:
frame

Unnamed: 0,state,year,popul,new_by_Series
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,0.0
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


In [56]:
frame.reindex([2,3,1,0])

Unnamed: 0,state,year,popul,new_by_Series
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,
1,Ohio,2001,1.7,0.0
0,Ohio,2000,1.5,0.0


### EF - Reindexing - reorder Columns with axis = "columns"

In [57]:
frame.reindex(["popul", "state", "year"], axis = "columns")

Unnamed: 0,popul,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Nevada,2002
3,2.4,Nevada,2003


* It droped automaticaly unassigned columns values 

### EF - Dropping Entries from an Axis

* Drop method is using for deleting spesific entries
* Just as reindex method drop method has an parameter of axis info

In [58]:
frame

Unnamed: 0,state,year,popul,new_by_Series
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,0.0
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


In [59]:
frame.drop([0,1]) # It didnt change real dF because it is a copy of the frame

Unnamed: 0,state,year,popul,new_by_Series
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


In [60]:
frame.drop(["state","year"], axis="columns")

Unnamed: 0,popul,new_by_Series
0,1.5,0.0
1,1.7,0.0
2,3.6,0.0
3,2.4,


In [61]:
frame

Unnamed: 0,state,year,popul,new_by_Series
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,0.0
2,Nevada,2002,3.6,0.0
3,Nevada,2003,2.4,


### EF - Slicing with Series, Selection and Filtering

#### EF - Slicing with Series

In [62]:
frame.index = inx2
frame

Unnamed: 0,state,year,popul,new_by_Series
10,Ohio,2000,1.5,0.0
20,Ohio,2001,1.7,0.0
30,Nevada,2002,3.6,0.0
40,Nevada,2003,2.4,


* End point is exclusive

In [63]:
frame["year"][0:3]

10    2000
20    2001
30    2002
Name: year, dtype: object

* At the Slicing with label end point is inclusive

In [64]:
frame["year"]["10":"30"]

10    2000
20    2001
30    2002
Name: year, dtype: object

* We can assign new values with slicing

In [65]:
frame["year"]["10":"30"] = ["3000", "3001", "3002"]
frame


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,state,year,popul,new_by_Series
10,Ohio,3000,1.5,0.0
20,Ohio,3001,1.7,0.0
30,Nevada,3002,3.6,0.0
40,Nevada,2003,2.4,


* It throwed a warning but did what we command and change values sliced
* But if you want to change more than one column it wont work with slice
* We should use .loc and .iloc special indexing operators

### EF - Selecting with loc and iloc

* with loc spesific location by index and column names
* with iloc spesific location by index and column index

In [66]:
frame

Unnamed: 0,state,year,popul,new_by_Series
10,Ohio,3000,1.5,0.0
20,Ohio,3001,1.7,0.0
30,Nevada,3002,3.6,0.0
40,Nevada,2003,2.4,


* loc

In [67]:
frame.loc["10" : "40", "state" : "popul" ]

Unnamed: 0,state,year,popul
10,Ohio,3000,1.5
20,Ohio,3001,1.7
30,Nevada,3002,3.6
40,Nevada,2003,2.4


In [68]:
frame.loc[["10", "40"], ["state", "popul"]]

Unnamed: 0,state,popul
10,Ohio,1.5
40,Nevada,2.4


* iloc

In [69]:
frame.iloc[[0,3], [0, 2]]

Unnamed: 0,state,popul
10,Ohio,1.5
40,Nevada,2.4


In [70]:
frame.iloc[0:3, 0:2]

Unnamed: 0,state,year
10,Ohio,3000
20,Ohio,3001
30,Nevada,3002


#### Example

In [71]:
frame

Unnamed: 0,state,year,popul,new_by_Series
10,Ohio,3000,1.5,0.0
20,Ohio,3001,1.7,0.0
30,Nevada,3002,3.6,0.0
40,Nevada,2003,2.4,


In [72]:
frame.iloc[:, 0:3][frame.popul <2.3]

Unnamed: 0,state,year,popul
10,Ohio,3000,1.5
20,Ohio,3001,1.7


### EF - Finding with isin() function

In [73]:
frame2 = frame.copy()
frame2

Unnamed: 0,state,year,popul,new_by_Series
10,Ohio,3000,1.5,0.0
20,Ohio,3001,1.7,0.0
30,Nevada,3002,3.6,0.0
40,Nevada,2003,2.4,


In [74]:
#frame.index.rename("sayi")

In [75]:
frame2

Unnamed: 0,state,year,popul,new_by_Series
10,Ohio,3000,1.5,0.0
20,Ohio,3001,1.7,0.0
30,Nevada,3002,3.6,0.0
40,Nevada,2003,2.4,


In [76]:
frame2.state.isin(["Nevada"])

10    False
20    False
30     True
40     True
Name: state, dtype: bool

* Upper and bottom side both return same
* isin() function is useful when you select more than one type of value from  a column

In [77]:
frame2.state == "Nevada"

10    False
20    False
30     True
40     True
Name: state, dtype: bool

### EF - Arithmetic

* Arithmetic operations are conducted with index based
* Like reindex operations if there is not any match index, pandas assign a NaN value

In [78]:
df1 = pd.DataFrame(np.arange(9.).reshape(3,3), index = list("abc"), columns = ["Car", "Bicycle", "Plain"])
df2 = pd.DataFrame(np.arange(16.).reshape(4,4), index = list("abcd"), columns = ["Car", "Bicycle", "Motor","Plain"])
                                                                               

In [79]:
df1

Unnamed: 0,Car,Bicycle,Plain
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0


In [80]:
df2

Unnamed: 0,Car,Bicycle,Motor,Plain
a,0.0,1.0,2.0,3.0
b,4.0,5.0,6.0,7.0
c,8.0,9.0,10.0,11.0
d,12.0,13.0,14.0,15.0


In [81]:
df6 = df1 + df2
df6

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,,5.0
b,9.0,7.0,,12.0
c,16.0,14.0,,19.0
d,,,,


* DataFrame() class has some methods for arithmetic operations like x.add(y)
* add method has an fill_value parameter for NaN values in dataframes **NOT FOR NaN VALUES İN RESULT FRAME 

In [82]:
df3 = pd.DataFrame({"A" : [1,3,4],
                    "B" : [3,5,6]})
df4 = pd.DataFrame({"A" : [42,65,33],
                    "B" : [11,22,17]})

In [83]:
df3.iloc[[1,2], 1] = np.nan
df3

Unnamed: 0,A,B
0,1,3.0
1,3,
2,4,


In [84]:
df4

Unnamed: 0,A,B
0,42,11
1,65,22
2,33,17


In [85]:
df3.add(df4, fill_value = 0)


Unnamed: 0,A,B
0,43,14.0
1,68,22.0
2,37,17.0


### EF - Function Application and Mapping

In [86]:
df4

Unnamed: 0,A,B
0,42,11
1,65,22
2,33,17


In [87]:
f = lambda x : max(x) 
df4.apply(f)

A    65
B    22
dtype: int64

In [88]:
df4.apply(f, axis="columns")

0    42
1    65
2    33
dtype: int64

In [89]:
df5 = pd.DataFrame(np.random.randn(3,3), index = list("abc"), columns = ["Car", "Bicycle", "Plain"])
df5

Unnamed: 0,Car,Bicycle,Plain
a,0.948161,-0.176724,-0.805077
b,0.84006,0.138578,-0.692536
c,-0.52794,0.445401,-0.12348


In [90]:
format = lambda x: '%.2f' % x
df5.applymap(format)

Unnamed: 0,Car,Bicycle,Plain
a,0.95,-0.18,-0.81
b,0.84,0.14,-0.69
c,-0.53,0.45,-0.12


### EF - Sorting Series and DataFrames

#### EF - Sorting - .sort_index( , ascending = True)

In [91]:
serie1


app    2
ban    4
pea    6
dtype: int64

In [92]:
serie1.sort_index()

app    2
ban    4
pea    6
dtype: int64

In [93]:
serie1.sort_index(ascending=False)

pea    6
ban    4
app    2
dtype: int64

In [94]:
frame4 = frame
frame4.columns = list("bcad")
frame4.index = [30, 40, 50, 10]
frame4

Unnamed: 0,b,c,a,d
30,Ohio,3000,1.5,0.0
40,Ohio,3001,1.7,0.0
50,Nevada,3002,3.6,0.0
10,Nevada,2003,2.4,


In [95]:
frame4.sort_index()

Unnamed: 0,b,c,a,d
10,Nevada,2003,2.4,
30,Ohio,3000,1.5,0.0
40,Ohio,3001,1.7,0.0
50,Nevada,3002,3.6,0.0


In [96]:
frame4.sort_index(axis="columns")

Unnamed: 0,a,b,c,d
30,1.5,Ohio,3000,0.0
40,1.7,Ohio,3001,0.0
50,3.6,Nevada,3002,0.0
10,2.4,Nevada,2003,


#### EF - Sorting - .sort_values( , ascending = True)

In [97]:
frame4.sort_values(by="a", ascending=False)

Unnamed: 0,b,c,a,d
50,Nevada,3002,3.6,0.0
10,Nevada,2003,2.4,
40,Ohio,3001,1.7,0.0
30,Ohio,3000,1.5,0.0


In [98]:
frame4.loc[[10,30], "a"] = [3.6, 1.7]
frame4.sort_values(by="a", ascending=False)

Unnamed: 0,b,c,a,d
50,Nevada,3002,3.6,0.0
10,Nevada,2003,3.6,
30,Ohio,3000,1.7,0.0
40,Ohio,3001,1.7,0.0


In [99]:
frame4.sort_values(by=["a", "c"], ascending=True)

Unnamed: 0,b,c,a,d
30,Ohio,3000,1.7,0.0
40,Ohio,3001,1.7,0.0
10,Nevada,2003,3.6,
50,Nevada,3002,3.6,0.0


## 4.Data Preprocessing (DP)

### DP - Missing Data

* dropna()
* fillna()
* isnull()
* notnull()
* to_numeric()

#### DP - Missing Data - dropna()

In [100]:
frame5 = df6.copy()
frame5


Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,,5.0
b,9.0,7.0,,12.0
c,16.0,14.0,,19.0
d,,,,


In [101]:
frame5["Plain"].isnull()

a    False
b    False
c    False
d     True
Name: Plain, dtype: bool

In [102]:
frame5.isnull()


Unnamed: 0,Bicycle,Car,Motor,Plain
a,False,False,True,False
b,False,False,True,False
c,False,False,True,False
d,True,True,True,True


In [103]:
frame5["Plain"].notnull()

a     True
b     True
c     True
d    False
Name: Plain, dtype: bool

In [104]:
frame5.loc["a","Motor"] = 3.0
frame5.loc["c","Car"] = np.nan
frame5


Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,,12.0
c,16.0,,,19.0
d,,,,


* dropna() drop all rows which includes NaN values

In [105]:
frame5.dropna()

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0


* dropna(how="all") Drop rows just has all value NaN type

In [106]:
frame5.dropna(how="all")

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,,12.0
c,16.0,,,19.0


In [107]:
frame5.dropna(axis="columns", how="all")

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,,12.0
c,16.0,,,19.0
d,,,,


* dropna(thresh=3) means:
* To survive, rows should have real values at least thresh value (2 for this example)

In [108]:
frame5.dropna(thresh=3)

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,,12.0


* dropna(thresh=2, axis=1) means:
* min real value count to survive columns (2 for this example)

In [109]:
frame5.dropna(thresh=2, axis=1)

Unnamed: 0,Bicycle,Car,Plain
a,2.0,0.0,5.0
b,9.0,7.0,12.0
c,16.0,,19.0
d,,,


#### DP - Missing Data - fillna()

In [110]:
frame5

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,,12.0
c,16.0,,,19.0
d,,,,


In [111]:
frame5.fillna(0)

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,0.0,12.0
c,16.0,0.0,0.0,19.0
d,0.0,0.0,0.0,0.0


* Spesific columns with dictionary

In [112]:
frame5.fillna({"Bicycle": 0.001, "Motor": 0.002})

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,0.002,12.0
c,16.0,,0.002,19.0
d,0.001,,0.002,


In [113]:
temp_frame = frame5.copy()
temp_frame.fillna(0, inplace=True)

In [114]:
temp_frame

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,0.0,12.0
c,16.0,0.0,0.0,19.0
d,0.0,0.0,0.0,0.0


In [115]:
frame5

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,,12.0
c,16.0,,,19.0
d,,,,


* fillna(method="ffill") : Last real value in column will takes place of NaN value

In [116]:
frame5.fillna(method="ffill")

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,3.0,12.0
c,16.0,7.0,3.0,19.0
d,16.0,7.0,3.0,19.0


In [117]:
frame5.fillna(method="ffill", limit=2)

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,3.0,12.0
c,16.0,7.0,3.0,19.0
d,16.0,7.0,,19.0


In [118]:
frame5

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,,12.0
c,16.0,,,19.0
d,,,,


In [119]:
frame5.fillna(frame5["Car"].mean())


Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,3.5,12.0
c,16.0,3.5,3.5,19.0
d,3.5,3.5,3.5,3.5


* How can we choose spesific column for fill with a method like mean???
* The answer is below

In [120]:
frame5.fillna({"Car": frame5["Car"].mean()})

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,,12.0
c,16.0,3.5,,19.0
d,,3.5,,


#### DP - Missing Data - to_numeric()

In [121]:
frame5

Unnamed: 0,Bicycle,Car,Motor,Plain
a,2.0,0.0,3.0,5.0
b,9.0,7.0,,12.0
c,16.0,,,19.0
d,,,,


In [122]:
pd.to_numeric(frame5["Car"])

a    0.0
b    7.0
c    NaN
d    NaN
Name: Car, dtype: float64

### DP - Column Name Arrangements (CNA)

#### DP - CNA- Upper lower case

In [123]:
dt3 = dt.copy() 

In [124]:

dt3.columns = ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
dt3

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [125]:
dt3.columns = [each.lower() for each in dt3.columns]
dt3

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### DP - CNA- Space between words

In [126]:
dt3.columns = [each.split()[0] + "_" + each.split()[1] if len(each.split())>1 else each for each in dt3.columns]

In [127]:
dt3

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### DP - Concatinating

In [128]:
concat1 = frame.copy()
concat2 = frame2

concat1.index = concat2.index

concat1

Unnamed: 0,b,c,a,d
10,Ohio,3000,1.7,0.0
20,Ohio,3001,1.7,0.0
30,Nevada,3002,3.6,0.0
40,Nevada,2003,3.6,


In [129]:
concat2

Unnamed: 0,state,year,popul,new_by_Series
10,Ohio,3000,1.5,0.0
20,Ohio,3001,1.7,0.0
30,Nevada,3002,3.6,0.0
40,Nevada,2003,2.4,


In [130]:
pd.concat([concat1,concat2], axis="columns")

Unnamed: 0,b,c,a,d,state,year,popul,new_by_Series
10,Ohio,3000,1.7,0.0,Ohio,3000,1.5,0.0
20,Ohio,3001,1.7,0.0,Ohio,3001,1.7,0.0
30,Nevada,3002,3.6,0.0,Nevada,3002,3.6,0.0
40,Nevada,2003,3.6,,Nevada,2003,2.4,


pd.concat([concat1,concat2], axis="rows")

In [131]:
df2

Unnamed: 0,Car,Bicycle,Motor,Plain
a,0.0,1.0,2.0,3.0
b,4.0,5.0,6.0,7.0
c,8.0,9.0,10.0,11.0
d,12.0,13.0,14.0,15.0


### DP - Train Test Splitting

In [132]:
df_split = pd.DataFrame({'num_legs': [2, 4, 8, 0, 2, 4],
                   'num_wings': [2, 0, 0, 0, 0, 0],
                   'num_specimen_seen': [10, 2, 1, 8, 1, 12]},
                  index=['falcon', 'dog', 'spider', 'fish', 'human', 'horse'])

In [133]:
df_split

Unnamed: 0,num_legs,num_wings,num_specimen_seen
falcon,2,2,10
dog,4,0,2
spider,8,0,1
fish,0,0,8
human,2,0,1
horse,4,0,12


In [134]:
df_split_train = df_split.sample(frac=0.3, replace=False, random_state=1)

* It returns %30 sample of main dataframe randomly

In [135]:
df_split_train

Unnamed: 0,num_legs,num_wings,num_specimen_seen
spider,8,0,1
dog,4,0,2


* We can randomly sample just for a future by using sample function to a serie 
* The value "n" represents the exact number of samples to be taken.
* You can use "frac" or "n" for determining the amount of sample to be taken

In [136]:
df_split.num_legs.sample(n=3, random_state=1 )

spider    8
dog       4
human     2
Name: num_legs, dtype: int64

### DP - String Operations

In [173]:
dic_string = {"state" : ["Ohio(2000)", "Ohio(2001)", "Nevada(2002)", "Nevada(2001)"],
        "popul"  : [1.5, 1.7, 3.6, 2.4]}

In [174]:
df_string = pd.DataFrame(dic_string)
df_string

Unnamed: 0,state,popul
0,Ohio(2000),1.5
1,Ohio(2001),1.7
2,Nevada(2002),3.6
3,Nevada(2001),2.4


* We want to delete them from state column and add them under a new column as year.

In [175]:
type(df_string.state.str.extract('(\(\d\d\d\d\))',expand=True))


pandas.core.frame.DataFrame

In [176]:
type(df_string.state.str.extract('(\(\d\d\d\d\))',expand=False))

pandas.core.series.Series

In [177]:
df_string["years"] = df_string.state.str.extract("(\(\d\d\d\d\))", expand=False)
df_string

Unnamed: 0,state,popul,years
0,Ohio(2000),1.5,(2000)
1,Ohio(2001),1.7,(2001)
2,Nevada(2002),3.6,(2002)
3,Nevada(2001),2.4,(2001)


Let's get rid of Parentheses 

In [178]:
df_string["years"] = df_string.state.str.extract("(\d\d\d\d)", expand=False)
df_string

Unnamed: 0,state,popul,years
0,Ohio(2000),1.5,2000
1,Ohio(2001),1.7,2001
2,Nevada(2002),3.6,2002
3,Nevada(2001),2.4,2001


Let's get rid of years in the state column

In [179]:
df_string["state"] = df_string["state"].str.replace("(\(\d\d\d\d\))","")
df_string.head()

Unnamed: 0,state,popul,years
0,Ohio,1.5,2000
1,Ohio,1.7,2001
2,Nevada,3.6,2002
3,Nevada,2.4,2001


In [180]:
a = df_string["state"].values[0]
type(a)

str

In [181]:
list(a)

['O', 'h', 'i', 'o']

Discard the last space we added

In [159]:
df_string["state"]  = df_string["state"] .apply(lambda x: x.strip())
df_string.head()

Unnamed: 0,state,popul,years
0,Ohio,1.5,2000
1,Ohio,1.7,2001
2,Nevada,3.6,2002
3,Nevada,2.4,2001


In [203]:
for i,r in df_string.iterrows():
    print(r['popul'])
    print(" ")


1.5
 
1.7
 
3.6
 
2.4
 


In [204]:
df_string.at[0, "abc"] = 1

In [205]:
df_string.head()

Unnamed: 0,state,popul,years,abc
0,Ohio,1.5,2000,1.0
1,Ohio,1.7,2001,
2,Nevada,3.6,2002,
3,Nevada,2.4,2001,


## 4.Explorotory Data Analysis (EDA)

### EDA - value_counts()

In [None]:

dt3


* How many people rescued any how many people died in that accident.
* We can find the answer by value_counts() method

In [None]:
survived = dt3["survived"].value_counts(dropna=False)
print(str(survived[0]) + " person did not survive at the accident.")
print(str(survived[1]) + " person survived at the accident.")

In [None]:
dt3.describe()

In [None]:
dt3.age.max()

In [None]:
dt3.age.min()

In [None]:
dt3.boxplot(column="fare", by= "survived")

In [None]:
dt3.head()

In [None]:
dt3_melted = pd.melt(dt3.iloc[:,1:], id_vars="survived",var_name="features", value_name="values").sort_values('survived')
dt3_melted.head()

In [None]:
frame4["e"] =[0,0,0,0]
frame4.iloc[1,1] = 3000
frame4

In [None]:
frame4.duplicated()