# Importing Text Files in Python (open())

### open()  read()   .close()  .closed

In [1]:
filename = "source.txt"

In [2]:
filename 

'source.txt'

In [3]:
file = open(filename, mode = 'r')

In [4]:
file # wrapper

<_io.TextIOWrapper name='source.txt' mode='r' encoding='cp1252'>

In [5]:
file.read() # the first call returns the whole file; any further call returns an empty string because the read position is already at the end of the file

'It is not so simple anymore!'

In [6]:
# re-writing the above lines of code in a shorter format (better practice as it avoids clutter.)
file = open(filename, mode = 'r')
text = file.read()
text

'It is not so simple anymore!'

In [7]:
# closing the file (most important action; leaving an open connection is endangering the file and our control over it.)
file.close()

In [8]:
file.closed

True

## with - instead of using Python's open() function alone, we can rather combine it with the 'with' statement.

### The with statement gives us control over the file's status as well as its contents, because it automatically closes the file when the block ends. This is better practice.

In [9]:
filename = 'source.txt'

In [10]:
file = open(filename, mode = 'r')
text = file.read()
file.close()

In [11]:
# Read the file inside a `with` block: the file is closed automatically when
# the block ends. The result of read() is stored in `text`; a bare
# `out_file.read()` here would read the contents and then throw them away.
with open(filename, mode = "r") as out_file:
    text = out_file.read()

In [12]:
print(text) # content

It is not so simple anymore!


In [13]:
print(out_file) # prints the TextIOWrapper object itself (name, mode, encoding), not the file's contents

<_io.TextIOWrapper name='source.txt' mode='r' encoding='cp1252'>


## Using 'w' mode, we intend to write within the file (manipulate the information of the file)

In [14]:
with open(filename, mode = 'w') as out_file:
    out_file.write('It is not so simple anymore!')

In [15]:
print(out_file)

<_io.TextIOWrapper name='source.txt' mode='w' encoding='cp1252'>


In [16]:
print(text) # `text` is the string that was read into memory earlier, not a live view of the file; the write above is saved in the original text file, and to see the file's current contents we re-run the read cells from the top.

It is not so simple anymore!


# Importing .csv files with Pandas.

### pandas can turn the information into a table (DataFrame) using a default parser. But in some cases the data is messier, so to make sure our data is not messy we can open it in a text editor such as Sublime Text, Notepad++, or Atom (to eyeball our data).

In [17]:
filename = 'Lending-company.csv'

# Read the raw CSV text inside a `with` block so the file handle is closed
# automatically instead of being left open after the read.
with open(filename, mode = 'r') as file:
    text = file.read()
text

'LoanID,StringID,Product,CustomerGender,Location,Region,TotalPrice,StartDate,Deposit,DailyRate,TotalDaysYr,AmtPaid36,AmtPaid60,AmtPaid360,LoanStatus\n1,LoanID_1,Product B,Female,Location 3,Region 2,17600,04/07/2018,2200,45,365,3221,4166,14621,Active\n2,LoanID_2,Product D,Female,Location 6,Region 6,,02/01/2019,2200,45,365,3161,4096,16041,Active\n3,LoanID_3,Product B,Male,Location 8,Region 3,16600,08/12/2016,1000,45,365,2260,3205,16340,\n4,LoanID_4,Product A,Male,Location 26,Region 2,17600,,2200,45,365,3141,4166,16321,Active\n5,LoanID_5,Product B,Female,Location 34,Region 3,21250,28/10/2017,2200,55,365,3570,4745,14720,Active\n6,LoanID_6,Product A,Male,Location 34,Region 1,,19/04/2019,2200,45,365,3301,4066,15141,Active\n7,LoanID_7,Product A,Male,Location 25,,21250,04/07/2020,2200,55,365,1951,3176,18701,Active\n8,LoanID_8,Product D,Male,Location 46,Region 5,17600,24/04/2018,2200,45,365,4071,4056,16351,Active\n9,LoanID_9,Product A,Male,Location 156,Region 6,23250,03/09/2019,5000,55,365,5850

In [18]:
type(text)  # so this comes out as a str and we want it in the form of a Table, clean (DataFrame), therefore...

str

In [19]:
import pandas as pd

In [20]:
pd.read_csv('Lending-company.csv', header = 0)

Unnamed: 0,LoanID,StringID,Product,CustomerGender,Location,Region,TotalPrice,StartDate,Deposit,DailyRate,TotalDaysYr,AmtPaid36,AmtPaid60,AmtPaid360,LoanStatus
0,1,LoanID_1,Product B,Female,Location 3,Region 2,17600.0,04/07/2018,2200,45,365,3221,4166,14621,Active
1,2,LoanID_2,Product D,Female,Location 6,Region 6,,02/01/2019,2200,45,365,3161,4096,16041,Active
2,3,LoanID_3,Product B,Male,Location 8,Region 3,16600.0,08/12/2016,1000,45,365,2260,3205,16340,
3,4,LoanID_4,Product A,Male,Location 26,Region 2,17600.0,,2200,45,365,3141,4166,16321,Active
4,5,LoanID_5,Product B,Female,Location 34,Region 3,21250.0,28/10/2017,2200,55,365,3570,4745,14720,Active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038,1039,LoanID_1039,Product B,Male,Location 73,Region 6,17300.0,29/12/2018,2200,45,365,3251,4743,16617,Finished Payment
1039,1040,LoanID_1040,Product A,Male,Location 82,Region 1,,28/03/2018,2200,45,365,4090,5582,16617,Finished Payment
1040,1041,LoanID_1041,Product A,NotSpecified,Location 11,Region 4,17300.0,26/04/2018,2200,45,365,4051,5143,16617,Finished Payment
1041,1042,LoanID_1042,Product B,Female,Location 26,Region 6,16300.0,25/10/2016,1000,45,365,1930,3462,15617,Finished Payment


In [21]:
type(pd.read_csv(filename)) # here it is. (Easier to understand.)

pandas.core.frame.DataFrame

In [22]:
# re-writing the above lines of code in a shorter format (better practice as it avoids clutter.)
filename = 'Lending-company.csv'
my_csv_data = pd.read_csv(filename)
my_csv_data

Unnamed: 0,LoanID,StringID,Product,CustomerGender,Location,Region,TotalPrice,StartDate,Deposit,DailyRate,TotalDaysYr,AmtPaid36,AmtPaid60,AmtPaid360,LoanStatus
0,1,LoanID_1,Product B,Female,Location 3,Region 2,17600.0,04/07/2018,2200,45,365,3221,4166,14621,Active
1,2,LoanID_2,Product D,Female,Location 6,Region 6,,02/01/2019,2200,45,365,3161,4096,16041,Active
2,3,LoanID_3,Product B,Male,Location 8,Region 3,16600.0,08/12/2016,1000,45,365,2260,3205,16340,
3,4,LoanID_4,Product A,Male,Location 26,Region 2,17600.0,,2200,45,365,3141,4166,16321,Active
4,5,LoanID_5,Product B,Female,Location 34,Region 3,21250.0,28/10/2017,2200,55,365,3570,4745,14720,Active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038,1039,LoanID_1039,Product B,Male,Location 73,Region 6,17300.0,29/12/2018,2200,45,365,3251,4743,16617,Finished Payment
1039,1040,LoanID_1040,Product A,Male,Location 82,Region 1,,28/03/2018,2200,45,365,4090,5582,16617,Finished Payment
1040,1041,LoanID_1041,Product A,NotSpecified,Location 11,Region 4,17300.0,26/04/2018,2200,45,365,4051,5143,16617,Finished Payment
1041,1042,LoanID_1042,Product B,Female,Location 26,Region 6,16300.0,25/10/2016,1000,45,365,1930,3462,15617,Finished Payment


In [23]:
type(my_csv_data)

pandas.core.frame.DataFrame

In [24]:
print(my_csv_data)

      LoanID     StringID    Product CustomerGender     Location    Region  \
0          1     LoanID_1  Product B         Female   Location 3  Region 2   
1          2     LoanID_2  Product D         Female   Location 6  Region 6   
2          3     LoanID_3  Product B           Male   Location 8  Region 3   
3          4     LoanID_4  Product A           Male  Location 26  Region 2   
4          5     LoanID_5  Product B         Female  Location 34  Region 3   
...      ...          ...        ...            ...          ...       ...   
1038    1039  LoanID_1039  Product B           Male  Location 73  Region 6   
1039    1040  LoanID_1040  Product A           Male  Location 82  Region 1   
1040    1041  LoanID_1041  Product A   NotSpecified  Location 11  Region 4   
1041    1042  LoanID_1042  Product B         Female  Location 26  Region 6   
1042    1043  LoanID_1043  Product A   NotSpecified  Location 94  Region 6   

      TotalPrice   StartDate  Deposit  DailyRate  TotalDaysYr  

# Importing Data with the Index_col Parameter. 

In [25]:
filename = 'Lending-company.csv'
file = pd.read_csv(filename)
file

Unnamed: 0,LoanID,StringID,Product,CustomerGender,Location,Region,TotalPrice,StartDate,Deposit,DailyRate,TotalDaysYr,AmtPaid36,AmtPaid60,AmtPaid360,LoanStatus
0,1,LoanID_1,Product B,Female,Location 3,Region 2,17600.0,04/07/2018,2200,45,365,3221,4166,14621,Active
1,2,LoanID_2,Product D,Female,Location 6,Region 6,,02/01/2019,2200,45,365,3161,4096,16041,Active
2,3,LoanID_3,Product B,Male,Location 8,Region 3,16600.0,08/12/2016,1000,45,365,2260,3205,16340,
3,4,LoanID_4,Product A,Male,Location 26,Region 2,17600.0,,2200,45,365,3141,4166,16321,Active
4,5,LoanID_5,Product B,Female,Location 34,Region 3,21250.0,28/10/2017,2200,55,365,3570,4745,14720,Active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038,1039,LoanID_1039,Product B,Male,Location 73,Region 6,17300.0,29/12/2018,2200,45,365,3251,4743,16617,Finished Payment
1039,1040,LoanID_1040,Product A,Male,Location 82,Region 1,,28/03/2018,2200,45,365,4090,5582,16617,Finished Payment
1040,1041,LoanID_1041,Product A,NotSpecified,Location 11,Region 4,17300.0,26/04/2018,2200,45,365,4051,5143,16617,Finished Payment
1041,1042,LoanID_1042,Product B,Female,Location 26,Region 6,16300.0,25/10/2016,1000,45,365,1930,3462,15617,Finished Payment


In [26]:
# The column with unique values (LoanID) is the natural choice for the index, so we use it in place of the default integer index.

In [27]:
filename = 'Lending-company.csv'
file = pd.read_csv(filename, index_col = 'LoanID')
file.head(10)

Unnamed: 0_level_0,StringID,Product,CustomerGender,Location,Region,TotalPrice,StartDate,Deposit,DailyRate,TotalDaysYr,AmtPaid36,AmtPaid60,AmtPaid360,LoanStatus
LoanID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,LoanID_1,Product B,Female,Location 3,Region 2,17600.0,04/07/2018,2200,45,365,3221,4166,14621,Active
2,LoanID_2,Product D,Female,Location 6,Region 6,,02/01/2019,2200,45,365,3161,4096,16041,Active
3,LoanID_3,Product B,Male,Location 8,Region 3,16600.0,08/12/2016,1000,45,365,2260,3205,16340,
4,LoanID_4,Product A,Male,Location 26,Region 2,17600.0,,2200,45,365,3141,4166,16321,Active
5,LoanID_5,Product B,Female,Location 34,Region 3,21250.0,28/10/2017,2200,55,365,3570,4745,14720,Active
6,LoanID_6,Product A,Male,Location 34,Region 1,,19/04/2019,2200,45,365,3301,4066,15141,Active
7,LoanID_7,Product A,Male,Location 25,,21250.0,04/07/2020,2200,55,365,1951,3176,18701,Active
8,LoanID_8,Product D,Male,Location 46,Region 5,17600.0,24/04/2018,2200,45,365,4071,4056,16351,Active
9,LoanID_9,Product A,Male,Location 156,Region 6,23250.0,03/09/2019,5000,55,365,5850,7375,21250,
10,LoanID_10,Product C,Male,Location 21,Region 9,21250.0,25/07/2020,2200,55,365,2051,3176,18351,Active


In [28]:
# here we go!

In [29]:
# to understand how this actually works...
data = {'ProductName':['Product A', 'Product B', 'Product C'], 'ProductPrice': [22250, 16600, 12500]}
product_IDs = ['A', 'B', 'C']
df = pd.DataFrame(data, index = product_IDs)
df

Unnamed: 0,ProductName,ProductPrice
A,Product A,22250
B,Product B,16600
C,Product C,12500


# Importing Data with NumPy

In [30]:
import numpy as np

## np.loadtxt() vs np.genfromtxt()


In [31]:
lending_company_data_1 = np.loadtxt('Lending-Company-Numeric-Data.csv', delimiter = ',')
lending_company_data_1

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [32]:
lending_company_data_2 = np.genfromtxt('Lending-Company-Numeric-Data.csv', delimiter = ',')
lending_company_data_2

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [33]:
# even though the outputs from both the functions appear similar, but we check if they're actually same...
np.array_equal(lending_company_data_1, lending_company_data_2)

True

### Yes, THEY ARE THE SAME when the dataset is complete! The difference between the two is that .loadtxt() is faster but breaks if we feed it incomplete or ill-formatted datasets, while .genfromtxt() is a bit slower but handles missing values better.

In [34]:
# Understanding the 'Missing value' concept with an example.

In [35]:
lending_company_data_nan = np.loadtxt('Lending-Company-Numeric-Data-NAN.csv', delimiter = ";")
lending_company_data_nan

ValueError: could not convert string '' to float64 at row 11, column 4.

In [None]:
lending_company_data_nan = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', delimiter = ";")
lending_company_data_nan

In [None]:
# the other way (with .loadtxt()) to print the data and let it not throw an error is...
# When we only need to deliberately observe the data values & don't have to execute any Math. ops. 
# A better practise (Not throwing an error + not making it complex + its fast here)... ONLY WHEN WE NEED TO OBSERVE. 


lending_company_data_nan = np.loadtxt('Lending-Company-Numeric-Data-NAN.csv', 
                                       delimiter = ";", 
                                       dtype = np.str_)
lending_company_data_nan

# Partially Cleaning our Data set while Importing it. 

In [None]:
lending_company_data_nan = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', delimiter = ";")
lending_company_data_nan

## The .genfromtxt() function has dozens of parameters we can choose from...

### Removing the rows (unnecessary ones).

In [None]:
# removing the first two rows from the top (skip_header);
# we can use this when there are unnecessary comments (like these) at the top

lending_company_data_nan = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', 
                                          delimiter = ";",
                                          skip_header= 2)
lending_company_data_nan

In [None]:
lending_company_data_nan = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', 
                                          delimiter = ";",
                                          skip_footer= 2)
lending_company_data_nan

# removing the last two rows from the bottom (skip_footer);
# we can use this when there are unnecessary comments (like these) at the bottom

### Removing the columns (unnecessary ones); more precisely, we're telling Python which columns we want to keep...

In [None]:
lending_company_data_nan = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', 
                                          delimiter = ";",
                                          usecols= (1, 2))
lending_company_data_nan

In [None]:
lending_company_data_nan = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', 
                                          delimiter = ";",
                                          usecols= (0))
lending_company_data_nan

In [None]:
lending_company_data_nan = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', 
                                          delimiter = ";",
                                          usecols= (0, 2, 4))
lending_company_data_nan

### Combining skip_header/footers & usecols.

In [None]:
lending_company_data_nan = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', 
                                          delimiter = ";",
                                          usecols = (0, 2, 4),
                                          skip_header = 2,
                                          skip_footer = 2)
lending_company_data_nan

### Separating the above three columns into three separate columns... (unpacking them into 3 separate columns.)

In [None]:
# first naming the variables to store the three separate cols to store in...

lending_company_data_3, lending_company_data_4, lending_company_data_5  = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', 
                                          delimiter = ";",
                                          usecols = (0, 2, 4),
                                          skip_header = 2,
                                          skip_footer = 2)
lending_company_data_nan

In [None]:
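# The above method alone doesn't work: without unpack, the returned array is split row by row,
# so it cannot be assigned to three variables. We have to use the unpack parameter.
# unpack = True transposes the result so that each selected column (here 3) goes into its own variable.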

lending_company_data_3, lending_company_data_4, lending_company_data_5  = np.genfromtxt('Lending-Company-Numeric-Data-NAN.csv', 
                                                                                         delimiter = ";",
                                                                                         usecols = (0, 2, 4),
                                                                                         skip_header = 2,
                                                                                         skip_footer = 2,
                                                                                         unpack = True)

print(lending_company_data_3)
print(lending_company_data_4)
print(lending_company_data_5)

# Importing *.json Files.

## - Human Readable.

## - Universal Format.

## - Organised as plain text.

## - A very Long String.

In [36]:
prices_per_products = '{"Product A": 22250, "Product B": 16600, "Product C": 15600}'

In [37]:
prices_per_products

'{"Product A": 22250, "Product B": 16600, "Product C": 15600}'

In [46]:
type(prices_per_products)

str

In [47]:
import json

In [48]:
parsed_str = json.loads(prices_per_products)

In [49]:
parsed_str

{'Product A': 22250, 'Product B': 16600, 'Product C': 15600}

In [51]:
type(parsed_str)

dict

In [52]:
# most often we'd like to load the data in the tabular form therefore...

In [53]:
import pandas as pd

In [54]:
df = pd.read_json('Lending-company.json')
df

ValueError: Expected object or value

In [None]:
type(df)

# Working With Excel (.xlsx) Files in Python.

## Most of the cleaning we'll do with python initially is in .csv -> .xlsx format.

In [None]:
filename = 'Lending-Company-Numeric-Data.xlsx'
data = pd.read_excel(filename, )
data

In [None]:
filename = 'Lending-Company-Numeric-Data.xlsx'
data = pd.read_excel(filename, index_col = 'LoanID')
data

# Understanding the Potential of .read_csv()

In [None]:
location_data = pd.read_csv('Lending-company.csv', 
                             usecols = ['LoanID', 'StringID', 'CustomerGender', 'TotalPrice'], 
                             index_col = 'LoanID')
location_data[29:40] 

In [None]:
# until this point all we've done is the part of Data Collection Process (next Squeeze method too.)

# Importing Data With the Pandas Squeeze Method.

In [None]:
df = pd.read_csv('Lending-company.csv')
df.head()

In [None]:
type(df)

In [None]:
df_usecols = pd.read_csv('Lending-company.csv', usecols = ['Product'])
df_usecols.head(5)

In [None]:
type(df_usecols)

In [None]:
data = pd.read_csv('Lending-company.csv', usecols = ['Product'])
df_squeeze = data.squeeze('columns')
df_squeeze.head()

In [None]:
type(df_squeeze)

In [None]:
data = pd.read_csv('Lending-company.csv', usecols = ['Product'])
df_squeeze = data.squeeze('rows')
df_squeeze.head()

In [None]:
type(df_squeeze)

In [None]:
# Some Violent Confusion :>

data = pd.read_csv('Lending-company.csv', usecols = ['Product'])
df_squeeze_0 = data.squeeze('columns')
df_squeeze_1 = data.squeeze('rows')
df_squeeze_1.head()
df_squeeze_0.head()

# Exporting(Saving) Data From A Python Object into a Text File. (using Pandas)

In [None]:
filename = 'Lending-company.csv'
myData = pd.read_csv(filename)
myData.head(5)

In [None]:
# exporting in the form of a csv itself.
myData.to_csv('exported-csv-file.csv') 

In [None]:
# exporting in the form of a json file
myData.to_json('exported-json-file.json')

In [None]:
# exporting in the form of a excel file
myData.to_excel('exported-excel-file.xlsx')

In [None]:
# exporting in the form of a excel file
# removing the default index
myData.to_excel('exported-excel-file.xlsx', index = False) 

# Exporting(Saving) Data From A Python Object into a Text File. (using NumPy)

## np.save()

In [None]:
# package already imported

In [36]:
lending_co = np.genfromtxt('Lending-Company-Saving.csv', 
                           delimiter = ',', 
                           dtype = np.str_)
print(lending_co)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [37]:
np.save('Lending-Company-Saving', lending_co)  # creates an 'file-name.npy' file in the same dir. as our n/b.  # this saves

In [40]:
lending_data_save = np.load('Lending-Company-Saving.npy') # this loads. (does not import just loads)

In [41]:
print(lending_data_save)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [None]:
np.array_equal(lending_data_save, lending_co)

## np.savez()

In [None]:
lending_co = np.genfromtxt('Lending-Company-Saving.csv', 
                           delimiter = ',', 
                           dtype = np.str_)
lending_data_save = np.load('Lending-Company-Saving.npy') 
# we loaded the file. 

In [42]:
np.savez('Lending-Company-Saving', lending_co, lending_data_save)  
# np.savez does not create a .npy file it creates a .npz file
# Like an archive of multiple arrays that can store multiple .npy files.

In [44]:
lending_data_savez = np.load('Lending-Company-Saving.npz')

In [48]:
print(lending_data_savez["arr_0"])

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [49]:
np.savez('Lending-Company-Saving', company = lending_co, data_save = lending_data_save)

In [51]:
lending_data_savez = np.load('Lending-Company-Saving.npz')

In [53]:
lending_data_savez.files 
# now we don't need to access the files using arr_0 or arr_1 and so on. Instead we can use the file names we gave above.

['company', 'data_save']

In [54]:
print(lending_data_savez["company"])

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [55]:
print(lending_data_savez["data_save"]) 

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [57]:
np.array_equal(lending_data_savez["company"], lending_data_savez["data_save"])
# they're both the same. 

True

## np.savetxt()

In [59]:
lending_co = np.genfromtxt('Lending-Company-Saving.csv', 
                           delimiter = ',', 
                           dtype = np.str_)

In [61]:
np.savetxt('Lending-Company-Saving.txt', 
            lending_co, 
            fmt = '%s',
            delimiter = ',') 

In [62]:
lending_data_savetxt = np.genfromtxt('Lending-Company-Saving.txt', 
                                     delimiter = ',',
                                     dtype = np.str_)
print(lending_data_savetxt)

[['LoanID' 'StringID' 'Product' ... 'Location' 'Region' 'TotalPrice']
 ['1' 'id_1' 'Product B' ... 'Location 2' 'Region 2' '16600.0']
 ['2' 'id_2' 'Product B' ... 'Location 3' '' '16600.0']
 ...
 ['1041' 'id_1041' 'Product B' ... 'Location 23' 'Region 4' '16600.0']
 ['1042' 'id_1042' 'Product C' ... 'Location 52' 'Region 6' '15600.0']
 ['1043' 'id_1043' 'Product B' ... 'Location 142' 'Region 6' '16600.0']]


In [63]:
lending_data_save = np.load('Lending-Company-Saving.npy')

In [64]:
np.array_equal(lending_data_savetxt, lending_data_save)

True

# Working with Text Data and Argument Specifiers.

In [6]:
product_category = 'B'
print('This item is from Product Category %s.' % product_category) 

This item is from Product Category B.


In [9]:
product_category = ['A', 'B']
print('This item is from Product Category %s.' % product_category[0]) 

This item is from Product Category A.


In [11]:
quantities = [500, 600]
print('This item is of quantity: %d.' % quantities[1])

This item is of quantity: 600.


In [13]:
stock_share_price_list = [40.50, 60.35]
print('This stock costs $%.2f per share.' % stock_share_price_list[0])

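# why dollar here? The '$' is just a literal character in the string (a currency sign); only '%.2f' is the format specifier.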

This stock costs $40.50 per share.


In [16]:
i = 0
print('Currently we have %d units of category %s products in store.' % (quantities[i], product_category[i]))

Currently we have 500 units of category A products in store.


## Manipulating Python Strings.

In [22]:
print('this product is from \ncategory "A".')

this product is from 
category "A".


In [25]:
print('this product is from category "A".' + '\t this product is from category "B".')

this product is from category "A".	 this product is from category "B".


In [21]:
print('this product is from category "A". \rProduct 01')

Product 01ct is from category "A". 


## Python String Methods 

In [26]:
s = 'Price per unit'
s

'Price per unit'

In [27]:
s1 = s.replace("Price", "Cost")
s1

'Cost per unit'

In [28]:
s1.startswith('Cost')

True

In [30]:
s1.endswith('it')

# does not need to be an entire word.

True

In [31]:
s1

'Cost per unit'

In [32]:
s1.split()

['Cost', 'per', 'unit']

In [33]:
s1.split(' ')

['Cost', 'per', 'unit']

In [35]:
s1.split('per')

['Cost ', ' unit']

In [36]:
s1.split(' ', maxsplit = 0)

['Cost per unit']

In [37]:
s2 = 'Mr., John, Wilson'

In [38]:
s2.split(',')

['Mr.', ' John', ' Wilson']

In [39]:
s2.split(',')[0]

'Mr.'

In [40]:
s2.split(',')[1]

' John'

In [42]:
s3 = 'Mrs. Amy Moore'
s3

'Mrs. Amy Moore'

In [43]:
s3.upper()

'MRS. AMY MOORE'

In [44]:
s3.title()

'Mrs. Amy Moore'

In [45]:
s3 == s3.title()  # the result of this comparison is neither stored nor displayed; only the cell's last expression (s3) is shown
s3

'Mrs. Amy Moore'

In [46]:
s4 = '  quarterly earnings report  '

In [47]:
s4.strip()

'quarterly earnings report'

In [48]:
s4.strip(' ')

'quarterly earnings report'

In [50]:
s4.strip(' r')

'quarterly earnings report'

In [53]:
s4.strip(' qt ')

'uarterly earnings repor'

In [54]:
s4.lstrip()

'quarterly earnings report  '

In [55]:
s4.rstrip()

'  quarterly earnings report'

In [56]:
s4.strip(' quarterly')

'nings repo'

In [57]:
s4.lstrip(' quaterly')

'nings report  '

In [59]:
# the main functionality of the strip method is as follows...

s5 = '%#$$! Quarterly Earnings Report'
s5

'%#$$! Quarterly Earnings Report'

In [60]:
s5.strip(' %#$$!') 

'Quarterly Earnings Report'

## String Accessors.

In [2]:
import pandas as pd

In [3]:
operational_kpis = pd.Series(['Employee Satisfaction Rate', 'Employee Churn Rate'])
operational_kpis

0    Employee Satisfaction Rate
1           Employee Churn Rate
dtype: object

In [7]:
operational_kpis[0].lstrip('Employee ')

'Satisfaction Rate'

In [8]:
operational_kpis[1].lstrip('Employee ')

'Churn Rate'

In [10]:
pd.Series([operational_kpis[0].lstrip('Employee'), operational_kpis[1].lstrip('Employee')])

0     Satisfaction Rate
1            Churn Rate
dtype: object

In [11]:
operational_kpis.str

<pandas.core.strings.accessor.StringMethods at 0x1a63b438d60>

In [14]:
# str.lstrip() treats its argument as a set of characters, not as a prefix, so
# it would also eat the start of a value such as 'Employee Engagement'.
# str.removeprefix() (pandas 1.4+) removes exactly the leading 'Employee ' text.
operational_kpis.str.removeprefix('Employee ')

0    Satisfaction Rate
1           Churn Rate
dtype: object

In [16]:
test_series = pd.Series(['Test Data', 34])
test_series

0    Test Data
1           34
dtype: object

In [18]:
test_series.str.lstrip('Test ')

0    Data
1     NaN
dtype: object

In [19]:
test_series = pd.Series(['Test Data', '34'])
test_series

0    Test Data
1           34
dtype: object

In [20]:
test_series.str.lstrip('Test ')

0    Data
1      34
dtype: object

In [21]:
house_prices = pd.Series(['$400,000', '$500,000', '$600,000'])
house_prices

0    $400,000
1    $500,000
2    $600,000
dtype: object

In [22]:
# str.contains() interprets its pattern as a regular expression by default, and
# the regex '$' (end of string) matches every value, even one with no dollar
# sign. regex = False searches for a literal '$' character instead.
house_prices.str.contains('$', regex = False)

0    True
1    True
2    True
dtype: bool

## Using .format() Methods

In [23]:
time_horizon = 1, 3, 12
time_horizon

(1, 3, 12)

In [24]:
products = ['Product A', 'Product B']
products

['Product A', 'Product B']

In [27]:
'Expected sales for a period of {} month(s) for {}.'.format(time_horizon[2], products[0])

'Expected sales for a period of 12 month(s) for Product A.'

In [28]:
# types of parameters = positional, keyword.

In [29]:
'Expected sales for a period of {0} month(s) for {1}.'.format(time_horizon[2], products[0])

'Expected sales for a period of 12 month(s) for Product A.'

In [30]:
'Expected sales for a period of {1} month(s) for {0}.'.format(time_horizon[2], products[0])

'Expected sales for a period of Product A month(s) for 12.'

In [35]:
'Expected sales for a period of {t_hor[1]} month(s) for {prod[0]}.'.format(t_hor= time_horizon, prod= ['Product B', 'Product A'])

'Expected sales for a period of 3 month(s) for Product B.'

# Must Know Python Tools.

## Iterating Over Range Objects. 

In [36]:
t = (4, 5, 6, 7)
t

(4, 5, 6, 7)

In [37]:
l = [1, 3, 5, 7]
l

[1, 3, 5, 7]

In [38]:
s = 'Diksha'
s

'Diksha'

In [39]:
for i in t:
    print(i, end = ' ')

4 5 6 7 

In [40]:
for i in l:
    print(i, end = ' ')

1 3 5 7 

In [41]:
for i in s:
    print(i, end = ' ')

D i k s h a 

In [42]:
range(5)

range(0, 5)

In [43]:
for i in range(5):
    print(i, end = ' ')

0 1 2 3 4 

#### Nested Loops

In [44]:
for i in range(2):
    print(i)

0
1


In [46]:
for j in range(5):
    print(j)

0
1
2
3
4


In [47]:
for i in range(2):
    for j in range(5):
        print([i, j])

[0, 0]
[0, 1]
[0, 2]
[0, 3]
[0, 4]
[1, 0]
[1, 1]
[1, 2]
[1, 3]
[1, 4]


In [49]:
for i in ['Product A', 'Product B']:
    for j in range(5):
        print([i, j])

['Product A', 0]
['Product A', 1]
['Product A', 2]
['Product A', 3]
['Product A', 4]
['Product B', 0]
['Product B', 1]
['Product B', 2]
['Product B', 3]
['Product B', 4]


In [50]:
products = ['Product A', 'Product B']
exp_sales = [1000, 2000, 3000, 4000, 5000]

In [51]:
for i in products:
    for j in exp_sales:
        print([i, j])

['Product A', 1000]
['Product A', 2000]
['Product A', 3000]
['Product A', 4000]
['Product A', 5000]
['Product B', 1000]
['Product B', 2000]
['Product B', 3000]
['Product B', 4000]
['Product B', 5000]


#### Triple Nested Loops 

In [52]:
products = ['Product A', 'Product B']
exp_sales = [1000, 2000, 3000, 4000, 5000]

In [53]:
time_horizon = (1, 2, 3)

In [59]:
for i in products:
    for j in exp_sales:
        for k in time_horizon:
            print([i, j*k])

['Product A', 1000]
['Product A', 2000]
['Product A', 3000]
['Product A', 2000]
['Product A', 4000]
['Product A', 6000]
['Product A', 3000]
['Product A', 6000]
['Product A', 9000]
['Product A', 4000]
['Product A', 8000]
['Product A', 12000]
['Product A', 5000]
['Product A', 10000]
['Product A', 15000]
['Product B', 1000]
['Product B', 2000]
['Product B', 3000]
['Product B', 2000]
['Product B', 4000]
['Product B', 6000]
['Product B', 3000]
['Product B', 6000]
['Product B', 9000]
['Product B', 4000]
['Product B', 8000]
['Product B', 12000]
['Product B', 5000]
['Product B', 10000]
['Product B', 15000]


In [60]:
# Good naming 

for prod in products:
    for sale in exp_sales:
        for time in time_horizon:
            print('Expected sales for a period of {0} month(s) for {1}: ${sales}'.format(time, prod, sales = sale*time))
#             print([prod, sale*time])

Expected sales for a period of 1 month(s) for Product A: $1000
Expected sales for a period of 2 month(s) for Product A: $2000
Expected sales for a period of 3 month(s) for Product A: $3000
Expected sales for a period of 1 month(s) for Product A: $2000
Expected sales for a period of 2 month(s) for Product A: $4000
Expected sales for a period of 3 month(s) for Product A: $6000
Expected sales for a period of 1 month(s) for Product A: $3000
Expected sales for a period of 2 month(s) for Product A: $6000
Expected sales for a period of 3 month(s) for Product A: $9000
Expected sales for a period of 1 month(s) for Product A: $4000
Expected sales for a period of 2 month(s) for Product A: $8000
Expected sales for a period of 3 month(s) for Product A: $12000
Expected sales for a period of 1 month(s) for Product A: $5000
Expected sales for a period of 2 month(s) for Product A: $10000
Expected sales for a period of 3 month(s) for Product A: $15000
Expected sales for a period of 1 month(s) for Produc

In [61]:
# Deeply nested loops are often considered less Pythonic (they get hard to read). To deal with that we have...

#### List Comprehensions

In [1]:
numbers = [1, 14, 4, 5, 7, 100]
numbers

[1, 14, 4, 5, 7, 100]

In [4]:
new_number = []

for n in numbers:
    new_number.append(n*2)

print(new_number)

[2, 28, 8, 10, 14, 200]


In [5]:
new_number = [n*2 for n in numbers]
new_number

[2, 28, 8, 10, 14, 200]

In [6]:
type(new_number)

list

In [14]:
number = []
for i in range(2):
    for j in range(5):
        number.append(i + j)
number

[0, 1, 2, 3, 4, 1, 2, 3, 4, 5]

In [15]:
type(number)

list

In [9]:
number = [i + j for i in range(2) for j in range(5)]
number

[0, 1, 2, 3, 4, 1, 2, 3, 4, 5]

In [16]:
type(number[1])

# the type of the element at index 1, i.e. the second element (indexing starts at 0)

int

In [17]:
number = [[i + j for i in range(2)] for j in range(5)]
number

[[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]]

In [18]:
type(number[1])

list

In [21]:
n = list(range(1, 11))
n

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [22]:
type(n)

list

In [24]:
type(n[0])

int

In [28]:
[num ** 3 for num in range(1, 11) if num % 2 == 0]

[8, 64, 216, 512, 1000]

In [29]:
[num ** 3 for num in range(1, 11) if num % 2 != 0]

[1, 27, 125, 343, 729]

In [30]:
# just for the sake of comparison, following is the non pythonic way of writing this...

In [32]:
for i in range(1, 11):
    if i % 2 != 0:
        print(i ** 3, end = ' ')

1 27 125 343 729 

In [33]:
[num**3 if num % 2 != 0 else 'even' for num in range(1,11) ]

[1, 'even', 27, 'even', 125, 'even', 343, 'even', 729, 'even']

#### Anonymous (Lambda) Functions. 

In [36]:
def raise_to_the_power_of_2(x):
    """Return x squared, i.e. x raised to the power of 2."""
    squared = pow(x, 2)
    return squared

In [37]:
raise_to_the_power_of_2(45)

2025

In [38]:
lambda x: x ** 2

<function __main__.<lambda>(x)>

In [43]:
a_better_lambda_function = lambda x: x ** 2

In [44]:
a_better_lambda_function(3)

9

In [46]:
(lambda x: x ** 2)(11)

# even better

121

In [56]:
(lambda x: x ** 2 + x ** 3 - x)(4)

76

In [57]:
sum_xy = lambda x, y: x + y
sum_xy(78, 89)

167

In [58]:
sum_xy = lambda x, y: x + y(x)

In [59]:
sum_xy(1,2)

TypeError: 'int' object is not callable

In [60]:
sum_xy(2)

TypeError: <lambda>() missing 1 required positional argument: 'y'

In [61]:
sum_xy(2, lambda x: x + 2)

6

In [62]:
# here x = 2 and y is the function (lambda x: x + 2), so y(x) = 2 + 2 = 4;
# sum_xy then returns x + y(x) = 2 + 4, therefore the output is...
# 6!

In [63]:
# PRACTICE PRACTICE PRACTICE PRACTICE

## Data Gathering and Data Collection. 

In [64]:
# two types of data = primary and secondary. 

In [65]:
# we create primary, 
# we acquire secondary from a third party. 

In [67]:
# Web Scraping - technique of extracting info. programmatically from websites. (google does this)
# This is not scalable or reliable, and requires constant maintenance. 
# legal issues. 
# we did not practice Web Scraping in this course. 

In [68]:
# APIs - (Application Programming Interfaces), "bridges" of the digital world.
# designed specifically for programmatical data exchange. 
# The workflow and all related systems are streamlined and easy to use. 
# provide data in many formats (generally as a json file or a csv file.)

## APIs - POST and GET requests.

In [69]:
# post not needed in this course, what does that mean!

In [None]:
# rest in API.ipynb