In [2]:
import pandas as pd
import numpy as np

First task

In [79]:
# Read the content of the csv to a pandas DataFrame
# Set the index column to 'Id'
# Without index_col=0 you will get an Unnamed column
df = pd.read_csv('Iris_1.csv',index_col=0)  
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [80]:
# We want to work with the columns that contains numbers (floats)
number_columns = df.loc[:,"SepalLengthCm":"PetalWidthCm"]
number_columns.columns


Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')

In [81]:
# Isna will only recognize np.nans and not strings
df.isna().sum()

Id                0
SepalLengthCm    10
SepalWidthCm      3
PetalLengthCm     7
PetalWidthCm      0
Species           0
dtype: int64

In [82]:
# Showcase the unique values of each colunm, 
# nan => np.nan (fillna will recognize this)
# 'Nan' => string (fillna won't recognize this)

uniques = [number_columns[column].unique() for column in number_columns.columns.tolist()]    # list comprehension (one line for)
uniques

[array([5.1, 4.9, 4.7, 4.6, nan, 5.4, 4.4, 4.8, 4.3, 5.8, 5.7, 5.2, 5.5,
        4.5, 5.3, 7. , 6.4, 6.9, 6.5, 6.3, 6.6, 5.9, 6. , 6.1, 5.6, 6.7,
        6.2, 6.8, 7.1, 7.6, 7.3, 7.2, 7.7, 7.4, 7.9]),
 array([3.5, 3. , 3.2, 3.1, 3.6, 3.9, 3.4, 2.9, 3.7, 4. , 4.4, 3.8, 3.3,
        4.1, 4.2, 2.3, 2.8, 2.4, 2.7, 2. , nan, 2.5, 2.6]),
 array([1.4, 1.3, 1.5, 1.7, 1.6, 1.1, 1.2, 1. , 1.9, 4.7, 4.5, 4.9, 4. ,
        nan, 3.3, 3.9, 3.5, 4.2, 3.6, 4.4, 4.1, 4.8, 4.3, 3.8, 3.7, 5.1,
        3. , 6. , 5.9, 5.6, 5.8, 6.6, 6.3, 6.1, 5.3, 5.5, 6.7, 6.9, 5.7,
        6.4, 5.4, 5.2]),
 array(['0.2', '0.4', '0.3', '0.1', '0.5', '0.6', '1.4', '1.5', '1.3',
        '1.6', 'Nan', '1.1', '1.8', '1.2', '1.7', '2.5', '1.9', '2.1',
        '2.2', '2.0', '2.4', '2.3'], dtype=object)]

In [83]:
# After we check the types of the missing values 
# Replace the strings to np.nan
df.replace('Nan',np.nan,inplace=True)

In [84]:
# Now isna will recognize all the missing values
df.isna().sum()

Id                0
SepalLengthCm    10
SepalWidthCm      3
PetalLengthCm     7
PetalWidthCm      7
Species           0
dtype: int64

In [85]:
# Sum of all missing values
df.isna().sum().sum()

27

In [86]:
# df to work with
number_columns

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [87]:
# calculate the mean of every column 
# by indexing the df with column names we get the columns as pd.Series
# and we can call the mean() function
# a safety step is to cast the column values to float
means = [df[column].astype(float).mean() for column in number_columns.columns]   # list comprehension (one line for)

In [88]:
# list of column means
means

[5.9035714285714285,
 3.0714285714285716,
 3.7062937062937062,
 1.2083916083916084]

In [90]:
# replace nan values with the means 
# [(mean_1,column_1),(mean_2,column_2),.....] => zip creates a list like this (google it for further info)
for mean,column in zip(means,number_columns.columns):
    df[column].replace(np.nan,round(mean,1),inplace= True)

In [91]:
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.9,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,3.7,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [92]:
# nans are disapeared
df.isna().sum().sum()

0

Second task

In [118]:
df = pd.read_csv('Iris.csv')

In [95]:
# group by Species and call some aggregation functions

df.groupby("Species").agg({"Species":"count","SepalLengthCm":"max","PetalWidthCm":"min","SepalWidthCm":"mean"})

Unnamed: 0_level_0,Species,SepalLengthCm,PetalWidthCm,SepalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,50,5.8,0.1,3.418
Iris-versicolor,50,7.0,1.0,2.77
Iris-virginica,50,7.9,1.4,2.974


In [100]:
#first 30 rows and first 3 columns
df_1 = pd.read_csv("Iris.csv",nrows=30,usecols=range(0,3))
#first 30 rows and last 3 columns
df_2 = pd.read_csv("Iris.csv",nrows=30,usecols=range(3,6))
#remaining rows and first 3 columns
df_3 = pd.read_csv("Iris.csv",nrows=120,skiprows=30,usecols=range(0,3),names= ["Id","SepalLengthCm","SepalWidthCm"])
#remaining rows and last 3 columns
df_4 = pd.read_csv("Iris.csv",nrows=120,skiprows=30,usecols=range(3,6),names = ["PetalLengthCm","PetalWidthCm","Species"])

In [102]:
#first 'half'
first_df = pd.concat([df_1,df_2],axis=1)
#second 'half'
second_df= pd.concat([df_3,df_4],axis=1)
#whole df
whole_df = pd.concat([first_df,second_df],axis=0)

In [103]:
whole_df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
115,145,6.7,3.3,5.7,2.5,Iris-virginica
116,146,6.7,3.0,5.2,2.3,Iris-virginica
117,147,6.3,2.5,5.0,1.9,Iris-virginica
118,148,6.5,3.0,5.2,2.0,Iris-virginica


Third task

In [119]:
import random
# randomly sort ascendingly or descendingly the columns 
# I used list comprehension with if else in it (google it like this for further info)
# syntax : [executes when condition True  if some condition else executes when condition False for something in somethings]
# here you should use regular for not the one liner, but I wanted to show you
# how to insert if else in the one liner

[df[column].astype(float).sort_values(ascending=True,inplace=True)       #if random == 0 sort by ascending
                  if random.randint(0,1) == 0 
                  else df[column].astype(float).sort_values(ascending=False,inplace=True) #if random != 0 sort by descending
                  for column in df.columns[1:-1].tolist()]                   #df.columns[1:-1].tolist() ==  ['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']	

In [122]:
df

Unnamed: 0,0,1,2,3
0,7.9,4.4,1.0,0.1
1,7.7,4.2,1.1,0.1
2,7.7,4.1,1.2,0.1
3,7.7,4.0,1.2,0.1
4,7.7,3.9,1.3,0.1
...,...,...,...,...
145,4.5,2.3,6.4,2.4
146,4.4,2.2,6.6,2.4
147,4.4,2.2,6.7,2.5
148,4.4,2.2,6.7,2.5


In [129]:
# Here I want to sort ascending row wise by the first column
# so simply I call sort_values on df with the corresponding arguments
df.sort_values(by=0, axis=0,inplace=True)

In [130]:
df

Unnamed: 0,0,1,2,3
149,4.3,2.0,6.9,2.5
146,4.4,2.2,6.6,2.4
148,4.4,2.2,6.7,2.5
147,4.4,2.2,6.7,2.5
145,4.5,2.3,6.4,2.4
...,...,...,...,...
4,7.7,3.9,1.3,0.1
3,7.7,4.0,1.2,0.1
2,7.7,4.1,1.2,0.1
1,7.7,4.2,1.1,0.1


In [132]:
# Last but not least you can see that the indexes are a bit off
# so simply reset the indexes
# IMPORTANT i used inplace so I don't have to write like this:
# df = df.reset_index(....)
# without inplace you have to write as aforesaid, otherwise it won't have an impact on df
df.reset_index(drop=True,inplace=True)
df

Unnamed: 0,0,1,2,3
0,4.3,2.0,6.9,2.5
1,4.4,2.2,6.6,2.4
2,4.4,2.2,6.7,2.5
3,4.4,2.2,6.7,2.5
4,4.5,2.3,6.4,2.4
...,...,...,...,...
145,7.7,3.9,1.3,0.1
146,7.7,4.0,1.2,0.1
147,7.7,4.1,1.2,0.1
148,7.7,4.2,1.1,0.1
