In [1]:
import pandas as pd

In [2]:
# Main survey responses; the 'Respondent' column becomes the row index.
df = pd.read_csv('survey_results_public.csv', index_col='Respondent')
# Companion schema file, which tells us what question was asked for each survey column;
# its 'Column' column becomes the row index.
schema_df = pd.read_csv('survey_results_schema.csv', index_col='Column')

In [None]:
# .shape is an attribute (no parentheses) holding a (rows, columns) tuple for the DataFrame.
df.shape

In [None]:
# .info() prints a summary of the DataFrame: every column name with its non-null count
# and dtype, plus the total number of entries.
df.info()

In [None]:
# Raise the display limit so that wide frames are shown in full instead of having their
# middle columns elided ("...").
# NOTE(review): 61 is presumably the column count of this dataset — confirm against df.shape.
pd.set_option('display.max_columns',61)

In [None]:
# .head(n) shows the first n rows and .tail(n) the last n rows (both default to 5).
# Here we ask for the first 15 rows.
df.head(15)

In [None]:
# To select several columns at once, pass a list of column names inside the brackets;
# the result is a DataFrame containing just those columns.
df[['Country','Gender']]

In [None]:
# Rows are accessed with .iloc (integer positions, 0-based) or .loc (labels: index values and column names).
df.iloc[1]
# A list selects several rows at once: df.iloc[[1, 2, 3]]
# A second argument selects the column by position: df.iloc[1, 2] is the value in the
# second row and third column, because positions start at 0.

In [None]:
# value_counts() returns how many times each distinct value occurs in the column,
# e.g. how many respondents answered "Yes" and how many answered "No".
df['Hobbyist'].value_counts() 

In [None]:
# .loc slices by label for both rows and columns, and both endpoints are included:
# rows labelled 1 through 2 (Respondent ids, since that column is the index) and every
# column from 'Hobbyist' through 'Employment'.
df.loc[1:2, 'Hobbyist':'Employment']

<font size="5">INDEXING</font>

In [None]:
# A column can be made the index either with set_index() or, at load time, with the
# index_col argument of read_csv.
# 'Respondent' is already the index here (read_csv was called with index_col='Respondent'),
# so it is no longer a regular column and a bare df.set_index('Respondent') would raise
# KeyError. reset_index() first turns the index back into a column so that set_index()
# can be demonstrated.
df.reset_index().set_index('Respondent')
# set_index() returns a new DataFrame and leaves df itself unchanged; to keep the change,
# assign the result back to a variable or pass inplace=True.

In [None]:
# Because schema_df was loaded with index_col='Column', each row is labelled by a survey
# column name, so .loc[row_label, column_label] looks up a single cell: here the
# 'QuestionText' value for the 'Respondent' row.
schema_df.loc['Respondent','QuestionText']

In [None]:
# sort_index() returns a copy sorted by index label in ascending order;
# pass ascending=False for descending order. schema_df itself is not modified.
schema_df.sort_index()

<font size="5"> Filtering </font>

In [None]:
# Comparing a column with a value yields a boolean Series: True where the row matches
# the query, False where it does not. That Series acts as a filter (mask): indexing the
# DataFrame with it keeps only the rows marked True.
filt = (df['Country']=='Pakistan')
df[filt]

In [None]:
# The same mask can be passed to .loc, which is the preferred form because it also
# accepts a column selection.
df.loc[filt] # an optional 2nd argument picks the column(s) to return for the matching rows

In [None]:
# Masks can be combined with & (and), | (or) and ~ (not); each comparison is wrapped in
# its own parentheses.
# & narrows the search (both conditions must hold):
# filt2 = (df['Country']=='Brazil') & (df['Age']>30)
# | widens it (either condition may hold):
filt2 = (df['Country']=='Pakistan') | (df['Country']=='Brazil')
df.loc[filt2]
# ~ inverts a mask: ~filt2 selects every row whose country is neither Pakistan nor Brazil
# df.loc[~filt2]

In [None]:
# Mask as the row selector, list of column names as the column selector:
# only 'Age' and 'Country' are returned for the rows that pass the filter.
df.loc[filt2, ['Age','Country']]

In [None]:
# .isin() builds the mask from a list of accepted values, which is shorter than
# chaining many == comparisons with |.
countries = ['United States', 'United Kingdom', 'India', 'Canada']
filt3 = df['Country'].isin(countries)
df.loc[filt3, 'Country']

In [None]:
# The .str accessor exposes string methods on a whole column. str.contains() flags the
# rows whose text includes 'Python'; na=False makes missing values count as "no match"
# so the mask contains only True/False.
filt4 = df['LanguageWorkedWith'].str.contains('Python', na=False)
df.loc[filt4, 'LanguageWorkedWith']

<font size="5"> Altering the data </font>

In [None]:
# rename() takes a dictionary mapping old column names to new ones.
df.rename(columns={'Country':'country', 'Age':'age'}) 
# It returns a renamed copy and leaves df untouched; to keep the change, assign the
# result back to a variable or pass inplace=True.

In [None]:
# Altering the data in a row.
# Assigning a plain list to df.loc[label] replaces the whole row, so the list must
# supply a value for every column; a shorter list like the one below raises an error,
# and writing out every value is impractical for wide frames:
# df.loc[2] = ['i am not a professional dev', 'Yes']
# Instead, name the columns to change and pass one new value per named column.
df.loc[2, ['Age', 'Country']] = [19, 'United Kingdom']

In [None]:
# .at is an indexer (square brackets, not a method call) for reading or writing a single
# value by row label and column label; it can be used instead of .loc when only one cell
# is involved.
df.at[2, 'Age'] = 19

<font size="3">Updating multiple rows and columns </font> 
<br>
e.g. if we want to convert every value in the MainBranch column to lowercase, we can use the str.lower method

In [None]:
# The .str accessor applies lower() to every value in the column at once;
# assigning the result back replaces the column in df.
df['MainBranch'] = df['MainBranch'].str.lower()

There are 4 methods commonly used to make changes, and people often confuse them: <br>
apply, map, applymap, replace

<font size="3"> Apply function on a series </font>

In [None]:
def updateMainBranch(branch):
    """Return ``branch`` converted to upper case, for use with ``Series.apply``.

    Parameters
    ----------
    branch : str or any
        A single cell value taken from a column.

    Returns
    -------
    str or any
        The upper-cased text when ``branch`` is a string; otherwise ``branch``
        itself, unchanged.
    """
    # Missing answers reach this function as NaN (a float) or None, which have no
    # .upper() method; pass them through instead of raising AttributeError.
    if not isinstance(branch, str):
        return branch
    return branch.upper()

In [None]:
# apply() takes the function object itself (its name without parentheses) and calls it
# once per element of the Series; short lambda functions are commonly used here.
# df['MainBranch'].head(10).apply(updateMainBranch)
# Non-string entries (e.g. NaN for an unanswered question) have no .lower() method,
# so they are returned unchanged instead of raising AttributeError.
df['MainBranch'].head(10).apply(lambda x: x.lower() if isinstance(x, str) else x)

<font size="3"> Apply function on a dataFrame </font>

In [None]:
# With axis='columns' the function is called once per row (it runs across the columns),
# so len returns the number of columns in each row.
df.apply(len, axis='columns') # omit axis (default axis='index') to call len once per column, giving the number of rows in each