In [2]:
import pandas as pd

In [3]:
titanic = pd.read_csv("titanic.csv")

#  Filter Columns

In [4]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [5]:
male_column = titanic.loc[titanic.sex == "male"]

In [6]:
male_column.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
4,0,3,male,35.0,0,0,8.05,S,
5,0,3,male,,0,0,8.4583,Q,
6,0,1,male,54.0,0,0,51.8625,S,E
7,0,3,male,2.0,3,1,21.075,S,


In [7]:
filter_columns = male_column.columns[2:4]

In [8]:
male_column.loc[:, filter_columns]

Unnamed: 0,sex,age
0,male,22.0
4,male,35.0
5,male,
6,male,54.0
7,male,2.0
...,...,...
883,male,28.0
884,male,25.0
886,male,27.0
889,male,26.0


# Filtering DataFrames with many Conditions (AND | &)

In [9]:
means_1 = titanic.sex == 'male'
means_1.head()

0     True
1    False
2    False
3    False
4     True
Name: sex, dtype: bool

In [10]:
means_2 = titanic.age > 14
means_2.head()

0    True
1    True
2    True
3    True
4    True
Name: age, dtype: bool

In [11]:
result_survival_man = titanic.loc[means_1 & means_2, ['survived', 'pclass', 'sex', 'age']]
result_survival_man.head(10)

Unnamed: 0,survived,pclass,sex,age
0,0,3,male,22.0
4,0,3,male,35.0
6,0,1,male,54.0
12,0,3,male,20.0
13,0,3,male,39.0
20,0,2,male,35.0
21,1,2,male,34.0
23,1,1,male,28.0
27,0,1,male,19.0
30,0,1,male,40.0


# Filtering DF with many Conditions (OR  | )

## One Condition need True

In [12]:
woman = titanic.sex == "female"
woman.head()

0    False
1     True
2     True
3     True
4    False
Name: sex, dtype: bool

In [13]:
child = titanic.age < 14
child.head()

0    False
1    False
2    False
3    False
4    False
Name: age, dtype: bool

In [14]:
woman_and_child_survival = titanic.loc[woman & child, ['sex', 'age', 'survived']]
woman_and_child_survival.head(10)

Unnamed: 0,sex,age,survived
10,female,4.0,1
24,female,8.0,0
43,female,3.0,1
58,female,5.0,1
119,female,2.0,0
147,female,9.0,0
172,female,1.0,1
184,female,4.0,1
205,female,2.0,0
233,female,5.0,1


In [15]:
woman_and_child_survival.describe()

Unnamed: 0,age,survived
count,34.0,34.0
mean,5.279412,0.617647
std,3.522527,0.49327
min,0.75,0.0
25%,2.0,0.0
50%,4.5,1.0
75%,8.0,1.0
max,13.0,1.0


In [16]:
woman_and_child_survival.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34 entries, 10 to 852
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sex       34 non-null     object 
 1   age       34 non-null     float64
 2   survived  34 non-null     int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 1.1+ KB


# Advanced Filtering with between(), isin() and ~

## ~ use for NOT condition

In [17]:
summer = pd.read_csv("summer.csv")

### use normal between() return bool data
### between() use for filter 2 value

In [18]:
summer.Year.between(2004, 2008)

0        False
1        False
2        False
3        False
4        False
         ...  
31160    False
31161    False
31162    False
31163    False
31164    False
Name: Year, Length: 31165, dtype: bool

In [19]:
years2008 = summer.loc[summer.Year.between(2004,2008)]
years2008.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
25176,2004,Athens,Aquatics,Diving,"TIAN, Liang",CHN,Men,10M Platform,Bronze
25177,2004,Athens,Aquatics,Diving,"HU, Jia",CHN,Men,10M Platform,Gold
25178,2004,Athens,Aquatics,Diving,"HELM, Mathew",AUS,Men,10M Platform,Silver
25179,2004,Athens,Aquatics,Diving,"TOURKY, Loudy",AUS,Women,10M Platform,Bronze
25180,2004,Athens,Aquatics,Diving,"NEWBERY, Chantelle",AUS,Women,10M Platform,Gold


### use isin() use for filter many value

In [20]:
favorite_country = ["USA","ROM"]
summer.loc[summer.Country.isin(favorite_country)].head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
11,1896,Athens,Athletics,Athletics,"LANE, Francis",USA,Men,100M,Bronze
13,1896,Athens,Athletics,Athletics,"BURKE, Thomas",USA,Men,100M,Gold
15,1896,Athens,Athletics,Athletics,"CURTIS, Thomas",USA,Men,110M Hurdles,Gold
19,1896,Athens,Athletics,Athletics,"BLAKE, Arthur",USA,Men,1500M,Silver
21,1896,Athens,Athletics,Athletics,"BURKE, Thomas",USA,Men,400M,Gold


In [21]:
summer.Country.unique()

array(['HUN', 'AUT', 'GRE', 'USA', 'GER', 'GBR', 'FRA', 'AUS', 'DEN',
       'SUI', 'ZZX', 'NED', 'BEL', 'IND', 'CAN', 'BOH', 'SWE', 'NOR',
       'ESP', 'ITA', 'CUB', 'ANZ', 'RSA', 'FIN', 'RU1', 'EST', 'TCH',
       'NZL', 'BRA', 'JPN', 'LUX', 'ARG', 'POL', 'POR', 'URU', 'YUG',
       'ROU', 'HAI', 'EGY', 'PHI', 'IRL', 'CHI', 'LAT', 'MEX', 'TUR',
       'PAN', 'JAM', 'SRI', 'KOR', 'PUR', 'PER', 'IRI', 'TRI', 'URS',
       'VEN', 'BUL', 'LIB', 'EUA', 'ISL', 'PAK', 'BAH', 'BWI', 'TPE',
       'ETH', 'MAR', 'GHA', 'IRQ', 'SIN', 'TUN', 'KEN', 'NGR', 'GDR',
       'FRG', 'UGA', 'CMR', 'MGL', 'PRK', 'COL', 'NIG', 'THA', 'BER',
       'TAN', 'GUY', 'ZIM', 'CHN', 'CIV', 'ZAM', 'DOM', 'ALG', 'SYR',
       'SUR', 'CRC', 'INA', 'SEN', 'DJI', 'AHO', 'ISV', 'EUN', 'NAM',
       'QAT', 'LTU', 'MAS', 'CRO', 'ISR', 'SLO', 'IOP', 'RUS', 'UKR',
       'ECU', 'BDI', 'MOZ', 'CZE', 'BLR', 'TGA', 'KAZ', 'UZB', 'SVK',
       'MDA', 'GEO', 'HKG', 'ARM', 'AZE', 'BAR', 'KSA', 'KGZ', 'KUW',
       'VIE', 'MKD',

### use ~ for NOT condition

In [22]:
not_country_2008 = summer.loc[~summer.Year.between(2004,2008)]
not_country_2008.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


# Any() and All()
## any() return True if one value is True
## all() return True if all value is True

In [23]:
(titanic.sex == "male").any()

True

In [24]:
(titanic.sex == "male").all()

False

#### Need () for condition

In [25]:
(titanic.age == 100).any()

False

# Removing Columns

In [26]:
titanic_v1 = titanic.copy()
titanic_v1.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [27]:
titanic_v1.drop(columns=["pclass", "sibsp", "parch","embarked", "deck"], inplace=True)
titanic_v1.head()

Unnamed: 0,survived,sex,age,fare
0,0,male,22.0,7.25
1,1,female,38.0,71.2833
2,1,female,26.0,7.925
3,1,female,35.0,53.1
4,0,male,35.0,8.05


In [28]:
titanic_v2 = titanic.copy()
titanic_v2.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [29]:
select_col = titanic_v2.columns[1:5]
select_col

Index(['pclass', 'sex', 'age', 'sibsp'], dtype='object')

In [30]:
titanic_v2.drop(columns=select_col, inplace=True)
titanic_v2.head()

Unnamed: 0,survived,parch,fare,embarked,deck
0,0,0,7.25,S,
1,1,0,71.2833,C,C
2,1,0,7.925,S,
3,1,0,53.1,S,C
4,0,0,8.05,S,


# Removing Rows

In [31]:
cars = pd.read_csv("cars.csv")

In [32]:
cars.drop(columns=["mpg","cylinders","displacement","horsepower","weight","acceleration","model_year"], inplace=True)

In [33]:
cars.head()

Unnamed: 0,origin,name
0,usa,chevrolet chevelle malibu
1,usa,buick skylark 320
2,usa,plymouth satellite
3,usa,amc rebel sst
4,usa,ford torino


In [34]:
cars_filter = cars.loc[cars.name == "chevrolet chevelle malibu"]

In [35]:
cars_filter.drop(index=0)

Unnamed: 0,origin,name
35,usa,chevrolet chevelle malibu
161,usa,chevrolet chevelle malibu


### labels + axis=1 inseamna ca vom specifica coloanele care vrem sa le stergem

In [36]:
cars_filter.drop(labels="name", axis=1)

Unnamed: 0,origin
0,usa
35,usa
161,usa


### labels + axis=0 inseamna ca vom specifica randurile care vrem sa le stergem - in cazul nostru randul 1

In [37]:
cars.drop(labels=1, axis=0)

Unnamed: 0,origin,name
0,usa,chevrolet chevelle malibu
2,usa,plymouth satellite
3,usa,amc rebel sst
4,usa,ford torino
5,usa,ford galaxie 500
...,...,...
393,usa,ford mustang gl
394,europe,vw pickup
395,usa,dodge rampage
396,usa,ford ranger


# Adding new Columns
## Cu [] adaugam o coloana noua sau acesam o coloana existenta

In [38]:
titanic["T_T_t"] = 0
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,T_T_t
0,0,3,male,22.0,1,0,7.25,S,,0
1,1,1,female,38.0,1,0,71.2833,C,C,0
2,1,3,female,26.0,0,0,7.925,S,,0
3,1,1,female,35.0,1,0,53.1,S,C,0
4,0,3,male,35.0,0,0,8.05,S,,0


# Creating new Columns based on other Columns

In [39]:
titanic['inflation'] = titanic.fare * 10
titanic.drop(columns=["deck","pclass","T_T_t", "embarked"])

Unnamed: 0,survived,sex,age,sibsp,parch,fare,inflation
0,0,male,22.0,1,0,7.2500,72.500
1,1,female,38.0,1,0,71.2833,712.833
2,1,female,26.0,0,0,7.9250,79.250
3,1,female,35.0,1,0,53.1000,531.000
4,0,male,35.0,0,0,8.0500,80.500
...,...,...,...,...,...,...,...
886,0,male,27.0,0,0,13.0000,130.000
887,1,female,19.0,0,0,30.0000,300.000
888,0,female,,1,2,23.4500,234.500
889,1,male,26.0,0,0,30.0000,300.000


In [40]:
titanic['fare_EUR'] = titanic.fare / 1.1
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,T_T_t,inflation,fare_EUR
0,0,3,male,22.0,1,0,7.25,S,,0,72.5,6.590909
1,1,1,female,38.0,1,0,71.2833,C,C,0,712.833,64.803
2,1,3,female,26.0,0,0,7.925,S,,0,79.25,7.204545
3,1,1,female,35.0,1,0,53.1,S,C,0,531.0,48.272727
4,0,3,male,35.0,0,0,8.05,S,,0,80.5,7.318182


### insert() - adauga o coloana pe o anumita pozitie
### loc - pozitia
### column - numele coloanei
### value - valoarea coloanei

In [41]:
new_value = titanic.sex + "sex"
titanic.insert(loc=3, column="gen", value=new_value)
titanic.head()

Unnamed: 0,survived,pclass,sex,gen,age,sibsp,parch,fare,embarked,deck,T_T_t,inflation,fare_EUR
0,0,3,male,malesex,22.0,1,0,7.25,S,,0,72.5,6.590909
1,1,1,female,femalesex,38.0,1,0,71.2833,C,C,0,712.833,64.803
2,1,3,female,femalesex,26.0,0,0,7.925,S,,0,79.25,7.204545
3,1,1,female,femalesex,35.0,1,0,53.1,S,C,0,531.0,48.272727
4,0,3,male,malesex,35.0,0,0,8.05,S,,0,80.5,7.318182


# Create DF with zip()
## Python code creerea unui df cu zip()


In [42]:
nationality = ["Argentina", "Portugal", "Brasil", "France", "Germany"]
club = ["FC Barcelona", " Juventus FC", " Paris SG", "FC Bayern"]
world_champion = [False,False,False,True,True]
height = [1.70, 1.87, 1.75, 1.78, 1.93 ]
goals = [45, 44, 28, 21, 0]

In [43]:
df_zipp = list(zip(nationality, club, world_champion,  height, goals))

In [44]:
messi, eu, tu, neuer = df_zipp

In [45]:
messi

('Argentina', 'FC Barcelona', False, 1.7, 45)

In [46]:
eu

('Portugal', ' Juventus FC', False, 1.87, 44)

In [47]:
df_manual = pd.DataFrame(data=[messi, eu, tu, neuer],
                         index=["Lion Messi", "Christiano Ronaldo", "Neymar Junior", "Pisici"],
                         columns=["Nationality", "Club", "World_Chamipon", " Height", "Goals_2018"])

In [48]:
df_manual

Unnamed: 0,Nationality,Club,World_Chamipon,Height,Goals_2018
Lion Messi,Argentina,FC Barcelona,False,1.7,45
Christiano Ronaldo,Portugal,Juventus FC,False,1.87,44
Neymar Junior,Brasil,Paris SG,False,1.75,28
Pisici,France,FC Bayern,True,1.78,21


# V2 creerea df dict()

In [53]:
players = ["Lion Messi", "Christiano Ronaldo", "Neymar Junior", "Manuel Neuer", "Robert Lew"]
nationality = ["Argentina", "Portugal", "Brasil", "France", "Germany"]
club = ["FC Barcelona", " Juventus FC", " Paris SG", "FC Bayern", "FC Bayern"]
world_champion = [False,False,False,True,True]
height = [1.70, 1.87, 1.75, 1.78, 1.93]
goals = [45, 44, 28, 21, 0]

In [54]:
dic = {"Players":players, "National:": nationality, "Club":club, "World Champion":world_champion, "Height":height, "Goals":goals}

In [61]:
data_new = pd.DataFrame(data=dic)
data_new.head()

Unnamed: 0,Players,National:,Club,World Champion,Height,Goals
0,Lion Messi,Argentina,FC Barcelona,False,1.7,45
1,Christiano Ronaldo,Portugal,Juventus FC,False,1.87,44
2,Neymar Junior,Brasil,Paris SG,False,1.75,28
3,Manuel Neuer,France,FC Bayern,True,1.78,21
4,Robert Lew,Germany,FC Bayern,True,1.93,0


# Add Rows in existing DataFrame

In [64]:
new = pd.DataFrame(data=[["Darius", "Romania", "FC Barcelona", False, 1.80, 100]], columns=data_new.columns)
new

Unnamed: 0,Players,National:,Club,World Champion,Height,Goals
0,Darius,Romania,FC Barcelona,False,1.8,100


In [66]:
data_concat = pd.concat([data_new, new] , ignore_index=True)
data_concat

Unnamed: 0,Players,National:,Club,World Champion,Height,Goals
0,Lion Messi,Argentina,FC Barcelona,False,1.7,45
1,Christiano Ronaldo,Portugal,Juventus FC,False,1.87,44
2,Neymar Junior,Brasil,Paris SG,False,1.75,28
3,Manuel Neuer,France,FC Bayern,True,1.78,21
4,Robert Lew,Germany,FC Bayern,True,1.93,0
5,Darius,Romania,FC Barcelona,False,1.8,100
