# <font color="purple"><h3 align="center">DataFrame Basics Tutorial</h3></font>

## **Dataframe is most commonly used object in pandas. It is a table like datastructure containing rows and columns similar to excel spreadsheet**

In [1]:
import pandas as pd
weather_data = {
    'day': ['1/1/2017','1/2/2017','1/3/2017','1/4/2017','1/5/2017','1/6/2017'],
    'temperature': [32,35,28,24,32,31],
    'windspeed': [6,7,2,7,4,2],
    'event': ['Rain', 'Sunny', 'Snow','Snow','Sunny', 'Sunny']
}
df = pd.DataFrame(weather_data)
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Sunny
5,1/6/2017,31,2,Sunny


In [27]:
df.shape # rows, columns = df.shape

(6, 4)

In [2]:
newdf = df[2:5]
newdf

Unnamed: 0,day,temperature,windspeed,event
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Sunny


In [5]:
newdf = df.iloc[2:5, :-1]
newdf

Unnamed: 0,day,temperature,windspeed
2,1/3/2017,28,2
3,1/4/2017,24,7
4,1/5/2017,32,4


## <font color='blue'>Rows</font>

In [30]:
df.head(3) # df.head(3) 

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow


In [31]:
df.tail(2) # df.tail(2)

Unnamed: 0,day,temperature,windspeed,event
4,1/5/2017,32,4,Sunny
5,1/6/2017,31,2,Sunny


In [32]:
df.index[df.day ==  "1/4/2017"][0]

3

In [33]:
startindex = int(df.index[df.day ==  "1/4/2017"][0])
startindex

3

In [34]:
df[ startindex : startindex + 3 ]

Unnamed: 0,day,temperature,windspeed,event
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Sunny
5,1/6/2017,31,2,Sunny


## <font color='blue'>Columns</font>

In [35]:
# print(df.columns)
columnNames = df.columns.to_list()
print(columnNames)

['day', 'temperature', 'windspeed', 'event']


In [36]:
df.event

0     Rain
1    Sunny
2     Snow
3     Snow
4    Sunny
5    Sunny
Name: event, dtype: object

In [37]:
eventlist = df.event.to_list()
eventlist

['Rain', 'Sunny', 'Snow', 'Snow', 'Sunny', 'Sunny']

In [38]:
twodf = df[ ['day','event']  ]
twodf

Unnamed: 0,day,event
0,1/1/2017,Rain
1,1/2/2017,Sunny
2,1/3/2017,Snow
3,1/4/2017,Snow
4,1/5/2017,Sunny
5,1/6/2017,Sunny


In [39]:
df[["day", "event"]]

Unnamed: 0,day,event
0,1/1/2017,Rain
1,1/2/2017,Sunny
2,1/3/2017,Snow
3,1/4/2017,Snow
4,1/5/2017,Sunny
5,1/6/2017,Sunny


## <font color='blue'>Operations On DataFrame</font>

In [40]:
print(df.temperature.mean())
print(df['temperature'].std())

30.333333333333332
3.8297084310253524


In [41]:
len(df[ df['temperature'] > 30 ])

4

In [42]:
df['day'] [ df['temperature'] == df['temperature'].min() ] # Kinda doing SQL in pandas

3    1/4/2017
Name: day, dtype: object

In [43]:
df['day'][df['temperature'] == df['temperature'].min()] # Kinda doing SQL in pandas

3    1/4/2017
Name: day, dtype: object

In [44]:
data = df.temperature.sort_values()
data

3    24
2    28
5    31
0    32
4    32
1    35
Name: temperature, dtype: int64

In [45]:
df['temperature'].mean() # Kinda doing SQL in pandas

30.333333333333332

In [46]:
df['temperature'].std()

3.8297084310253524

In [47]:
df['event'].max() # But mean() won't work since data type is string

'Sunny'

In [48]:
# get max occuring element 

In [49]:
df.event.value_counts()

Sunny    3
Snow     2
Rain     1
Name: event, dtype: int64

In [50]:
df.event.value_counts().index[0]

'Sunny'

In [51]:
df.event.value_counts()[0]

3

In [52]:
print("Max Event : " + df.event.value_counts().index[0] + " and it occured " + str( df.event.value_counts()[0]) + " times" )

Max Event : Sunny and it occured 3 times


In [53]:
df.describe()

Unnamed: 0,temperature,windspeed
count,6.0,6.0
mean,30.333333,4.666667
std,3.829708,2.33809
min,24.0,2.0
25%,28.75,2.5
50%,31.5,5.0
75%,32.0,6.75
max,35.0,7.0


**Google pandas series operations to find out list of all operations**
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html

## <font color='blue'>set_index</font>

In [54]:
df.set_index("day", inplace=True)

In [55]:
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,32,6,Rain
1/2/2017,35,7,Sunny
1/3/2017,28,2,Snow
1/4/2017,24,7,Snow
1/5/2017,32,4,Sunny
1/6/2017,31,2,Sunny


In [56]:
df.shape

(6, 3)

In [57]:
df['day'][df['temperature'] == df['temperature'].min()] # Kinda doing SQL in pandas

KeyError: 'day'

In [58]:
df.loc["1/1/2017" : "1/4/2017"]

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,32,6,Rain
1/2/2017,35,7,Sunny
1/3/2017,28,2,Snow
1/4/2017,24,7,Snow


In [59]:
df.shape

(6, 3)

In [93]:
df.reset_index(inplace=True)
df

Unnamed: 0,index,day,temperature,windspeed,event
0,0,1/1/2017,32,6,Rain
1,1,1/2/2017,35,7,Sunny
2,2,1/3/2017,28,2,Snow
3,3,1/4/2017,24,7,Snow
4,4,1/5/2017,32,4,Sunny
5,5,1/6/2017,31,2,Sunny


In [94]:
df.shape

(6, 5)

In [95]:
df

Unnamed: 0,index,day,temperature,windspeed,event
0,0,1/1/2017,32,6,Rain
1,1,1/2/2017,35,7,Sunny
2,2,1/3/2017,28,2,Snow
3,3,1/4/2017,24,7,Snow
4,4,1/5/2017,32,4,Sunny
5,5,1/6/2017,31,2,Sunny


In [96]:
df.reset_index(inplace=True)

In [97]:
newdf = df.copy()
newdf.set_index("day" , inplace=True)

In [98]:
newdf

Unnamed: 0_level_0,level_0,index,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1/1/2017,0,0,32,6,Rain
1/2/2017,1,1,35,7,Sunny
1/3/2017,2,2,28,2,Snow
1/4/2017,3,3,24,7,Snow
1/5/2017,4,4,32,4,Sunny
1/6/2017,5,5,31,2,Sunny


In [99]:
df

Unnamed: 0,level_0,index,day,temperature,windspeed,event
0,0,0,1/1/2017,32,6,Rain
1,1,1,1/2/2017,35,7,Sunny
2,2,2,1/3/2017,28,2,Snow
3,3,3,1/4/2017,24,7,Snow
4,4,4,1/5/2017,32,4,Sunny
5,5,5,1/6/2017,31,2,Sunny


In [100]:
newdf.set_index("day" , inplace=True)

KeyError: "None of ['day'] are in the columns"

In [101]:
newdf

Unnamed: 0_level_0,level_0,index,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1/1/2017,0,0,32,6,Rain
1/2/2017,1,1,35,7,Sunny
1/3/2017,2,2,28,2,Snow
1/4/2017,3,3,24,7,Snow
1/5/2017,4,4,32,4,Sunny
1/6/2017,5,5,31,2,Sunny


In [102]:
newdf.reset_index(inplace=True)
newdf

Unnamed: 0,day,level_0,index,temperature,windspeed,event
0,1/1/2017,0,0,32,6,Rain
1,1/2/2017,1,1,35,7,Sunny
2,1/3/2017,2,2,28,2,Snow
3,1/4/2017,3,3,24,7,Snow
4,1/5/2017,4,4,32,4,Sunny
5,1/6/2017,5,5,31,2,Sunny


In [103]:
newdf.set_index("event", inplace=True)
newdf

Unnamed: 0_level_0,day,level_0,index,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Rain,1/1/2017,0,0,32,6
Sunny,1/2/2017,1,1,35,7
Snow,1/3/2017,2,2,28,2
Snow,1/4/2017,3,3,24,7
Sunny,1/5/2017,4,4,32,4
Sunny,1/6/2017,5,5,31,2


In [104]:
newdf.loc["Sunny"]

Unnamed: 0_level_0,day,level_0,index,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Sunny,1/2/2017,1,1,35,7
Sunny,1/5/2017,4,4,32,4
Sunny,1/6/2017,5,5,31,2


In [105]:
newdf.loc["Snow"]

Unnamed: 0_level_0,day,level_0,index,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Snow,1/3/2017,2,2,28,2
Snow,1/4/2017,3,3,24,7


In [106]:
df["NewData"] = np.arange(1,7) # [ 0,1,2,3,4,5]
df

NameError: name 'np' is not defined

In [107]:
mylist = []
for i in df.temperature:
    if i == 32:
        mylist.append(True)
    else:
        mylist.append(False)
df["Check"] = mylist

In [108]:
df

Unnamed: 0,level_0,index,day,temperature,windspeed,event,Check
0,0,0,1/1/2017,32,6,Rain,True
1,1,1,1/2/2017,35,7,Sunny,False
2,2,2,1/3/2017,28,2,Snow,False
3,3,3,1/4/2017,24,7,Snow,False
4,4,4,1/5/2017,32,4,Sunny,True
5,5,5,1/6/2017,31,2,Sunny,False


In [109]:
df["Names"] = np.array(["Ahmed", "Ali", "Omar" , "Emad" , "Anas" , "Amr" ])
df

NameError: name 'np' is not defined

In [110]:
df.reset_index(inplace=True)
df.drop("event", inplace=True, axis= 1)
df

ValueError: cannot insert level_0, already exists

In [111]:
df.drop("index", inplace=True, axis = 1)

In [112]:
df

Unnamed: 0,level_0,day,temperature,windspeed,event,Check
0,0,1/1/2017,32,6,Rain,True
1,1,1/2/2017,35,7,Sunny,False
2,2,1/3/2017,28,2,Snow,False
3,3,1/4/2017,24,7,Snow,False
4,4,1/5/2017,32,4,Sunny,True
5,5,1/6/2017,31,2,Sunny,False


In [113]:
df.index

RangeIndex(start=0, stop=6, step=1)

In [114]:
#df.reset_index(inplace=True)
df.set_index("temperature", inplace=True)

In [115]:
df

Unnamed: 0_level_0,level_0,day,windspeed,event,Check
temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
32,0,1/1/2017,6,Rain,True
35,1,1/2/2017,7,Sunny,False
28,2,1/3/2017,2,Snow,False
24,3,1/4/2017,7,Snow,False
32,4,1/5/2017,4,Sunny,True
31,5,1/6/2017,2,Sunny,False


In [116]:
df.loc[32]

Unnamed: 0_level_0,level_0,day,windspeed,event,Check
temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
32,0,1/1/2017,6,Rain,True
32,4,1/5/2017,4,Sunny,True


In [117]:
df.head()

Unnamed: 0_level_0,level_0,day,windspeed,event,Check
temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
32,0,1/1/2017,6,Rain,True
35,1,1/2/2017,7,Sunny,False
28,2,1/3/2017,2,Snow,False
24,3,1/4/2017,7,Snow,False
32,4,1/5/2017,4,Sunny,True


In [118]:
weather_data = {
    'day': ['1/1/2017','1/2/2017','1/3/2017','1/4/2017','1/5/2017','1/6/2017'],
    'temperature': [32,35,28,24,32,31],
    'windspeed': [6,7,2,7,4,2],
    'event': ['Rain', 'Sunny', 'Snow','Snow','Sunny', 'Sunny']
}
df = pd.DataFrame(weather_data)
df.set_index('event',inplace=True) # this is kind of building a hash map using event as a key
df

Unnamed: 0_level_0,day,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rain,1/1/2017,32,6
Sunny,1/2/2017,35,7
Snow,1/3/2017,28,2
Snow,1/4/2017,24,7
Sunny,1/5/2017,32,4
Sunny,1/6/2017,31,2


In [119]:
df

Unnamed: 0_level_0,day,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rain,1/1/2017,32,6
Sunny,1/2/2017,35,7
Snow,1/3/2017,28,2
Snow,1/4/2017,24,7
Sunny,1/5/2017,32,4
Sunny,1/6/2017,31,2


In [120]:
columns = []
data = dict()

num = int(input("please enter the number of columns"))
while(num > 0):
    columns.append(input("please enter the column name: "))
    num -=1

for i in columns:
    data[i] = []

rows = int(input("please enter the number of rows: "))
while(rows > 0):
    for i in data:
        value = input(f"please enter the value of {i}: ")
        data[i].append(value)

    rows-=1
dataframe = pd.DataFrame(data)
dataframe

please enter the number of columns


ValueError: invalid literal for int() with base 10: ''

In [None]:
dataframe["Id"].dtype

In [None]:
dataframe.id.astype(np.int8)

In [None]:
dataframe.Age = dataframe.Age.astype(np.int16)
dataframe.Age.dtype