## Program for returning the most recent year in which a value is greater than 0

In [1]:
import pandas as pd
import numpy as np

In [5]:
# read in the csv file as a pandas dataframe
# (relative path: 'Numbers.csv' is looked up in the current working directory)
df = pd.read_csv('Numbers.csv')

# show dataframe (a bare last expression is rendered as the cell's output)
df

Unnamed: 0,2015,2016,2017,2018,2019,2020
0,100,100,0,0,0,0
1,100,0,0,0,0,0
2,100,100,100,100,100,100
3,0,0,0,0,0,0
4,100,100,100,100,0,0
5,100,0,0,100,0,0


In [8]:
# get the year labels from the dataframe's columns; df.columns is a
# pandas Index (not a plain Python list) holding the column labels in order
years = df.columns

In [9]:
def find_first_year(row):
    '''
    Return the label of the most recent (right-most) entry in a row
    whose value is greater than 0.

    NOTE: despite the function's name, the *highest* matching position
    is used, so for a row that is positive in 2015 and 2018 the result
    is the 2018 label, not the 2015 one.

    Parameters
    ----------
    row : pandas.Series
        One row of the dataframe, indexed by the column labels (this is
        what ``DataFrame.apply(..., axis=1)`` passes in).

    Returns
    -------
    The index label of the last entry greater than 0, exactly as it
    appears in ``row.index`` (e.g. the string '2016' when the labels
    were read from a csv header), or the string 'N/A' when no entry is
    greater than 0.

    '''
    # np.where on a 1-D boolean mask returns a one-element tuple; its
    # first item holds the matching positions in ascending order
    positive_positions = np.where(row > 0)[0]

    # no value greater than 0 in this row: report that explicitly
    # instead of relying on an exception, so genuine errors (for example
    # non-numeric cells) are not silently turned into 'N/A'
    if positive_positions.size == 0:
        return 'N/A'

    # the last position is the largest one; look the label up on the row
    # itself so labels and values are always aligned
    return row.index[positive_positions[-1]]

In [10]:
# apply the above function to each row (axis=1 applies it row by row).
# Only the original year columns captured in `years` are passed in, so
# re-running this cell after the 'Year' column exists does not feed that
# column back into the function
df['Year'] = df[years].apply(find_first_year, axis=1)
df

Unnamed: 0,2015,2016,2017,2018,2019,2020,Year
0,100,100,0,0,0,0,2016.0
1,100,0,0,0,0,0,2015.0
2,100,100,100,100,100,100,2020.0
3,0,0,0,0,0,0,
4,100,100,100,100,0,0,2018.0
5,100,0,0,100,0,0,2018.0


In [11]:
# relocate 'Year' so it becomes the left-most column: pop() removes the
# column and hands it back, insert() puts it in again at position 0
df.insert(loc=0, column='Year', value=df.pop('Year'))
df

Unnamed: 0,Year,2015,2016,2017,2018,2019,2020
0,2016.0,100,100,0,0,0,0
1,2015.0,100,0,0,0,0,0
2,2020.0,100,100,100,100,100,100
3,,0,0,0,0,0,0
4,2018.0,100,100,100,100,0,0
5,2018.0,100,0,0,100,0,0


In [12]:
# (disabled) write the dataframe back out to a csv file without the index; uncomment the line below to run the export
# df.to_csv('NumberWithRecentYear.csv', index=False)

## Program for finding the proportion of misspellings of the word 'orange'

In [14]:
# read in the csv file as a pandas dataframe
# NOTE: this rebinds `df`, so the dataframe previously held under that name
# is no longer reachable from this cell onward
df = pd.read_csv('Words.csv')
df

Unnamed: 0,Name
0,Orange
1,OrAnge
2,Orenge
3,Orang
4,Orage
5,Orange
6,orange
7,cabbage
8,Lettuce
9,Butter


In [17]:
# take the 'Name' column (a pandas Series) and lowercase every entry in one
# chained expression, so spellings that differ only by case compare equal
s = df['Name'].str.lower()
s

0      orange
1      orange
2      orenge
3       orang
4       orage
5      orange
6      orange
7     cabbage
8     lettuce
9      butter
10       milk
Name: Name, dtype: object

In [18]:
# build a boolean mask marking the words that contain the substring 'or' and
# keep only those entries. na=False treats missing names as "no match", so a
# blank cell in the csv is dropped instead of producing a mask that cannot be
# used for indexing
s = s[s.str.contains('or', na=False)]
s

0    orange
1    orange
2    orenge
3     orang
4     orage
5    orange
6    orange
Name: Name, dtype: object

In [19]:
# tally how often each distinct spelling occurs using the value_counts method.
# normalize=True returns the proportion of each spelling instead of the raw count
# NOTE: `s` is overwritten with the result, so running this cell a second time
# would tally the proportions themselves rather than the words
s = s.value_counts(normalize=True)
s

orange    0.571429
orenge    0.142857
orang     0.142857
orage     0.142857
Name: Name, dtype: float64

In [20]:
# get the proportion of correct spellings of the word 'orange'.
# .get() falls back to 0.0 when 'orange' never occurs among the spellings,
# instead of raising a KeyError
correct = s.get('orange', 0.0)
correct

0.5714285714285714

In [21]:
# use f string to report the proportions. the 'f' in .2f selects fixed-point
# notation, i.e. exactly two digits after the decimal point (a bare .2 would
# mean two significant digits, e.g. 1.0 -> '1.0' and 0.057 -> '0.057')
print(f'The proportion of correct spellings of the word orange is {correct:.2f}')
print(f'The proportion of incorrect spellings of the word orange is {1-correct:.2f}')

The proportion of correct spellings of the word orange is 0.57
The proportion of incorrect spellings of the word orange is 0.43
