# Introduction to Python

In [49]:
# Modules, Libraries
import pandas as pd
import numpy as np
import datetime
import os
import sklearn
import truc

In [50]:
# Import personal module
truc.truc()

'truc'

In [51]:
# How to use a standard module
## Measure execution time
# Record the wall-clock time just before the workload starts.
t_start = datetime.datetime.now()

# Dummy workload: 10e6 is the float 10_000_000.0, i.e. ten million iterations.
# The result of i**4 is discarded; the loop exists only to take time.
for i in range(int(10e6)):
    i**4

# Record the wall-clock time right after the workload ends.
t_end = datetime.datetime.now()

# Subtracting two datetime objects gives a datetime.timedelta (elapsed time).
print(t_end-t_start)

0:00:02.270978


## Basic types / structures

In [None]:
# Text and numbers
12         # int (integer)
1.5        # float
'hola'     # str (string)
"hola"
"""hola"""

# Iterables
[42, 58, 209, 42]  # list
(42, 58, 209, 42)  # tuple
{42, 58, 209}      # set
{'name': ['akiko', 'julie'], 'age': [12, 43]}  # dict (dictionary)

### Dictionaries

In [52]:
# Three equivalent ways to write the same dictionary literal.
# Each assignment rebinds `dico` to an equal dictionary.
# 1) Single line: valid Python, but longer than PEP 8's 79-character limit.
dico = {'surname': ['jp', 'pa', 'ma', 'fo'], 'name': ['aba', 'chd', 'oih', 'iouh'], 'age': [12, 23, 40, 8]}

# 2) One key per line, aligned with the opening brace (PEP 8 compliant).
dico = {'surname': ['jp', 'pa', 'ma', 'fo'],
        'name': ['aba', 'chd', 'oih', 'iouh'],
        'age': [12, 23, 40, 8]}

# 3) Hanging indent with a trailing comma after the last item (PEP 8 compliant).
dico = {
    'surname': ['jp', 'pa', 'ma', 'fo'],
    'name': ['aba', 'chd', 'oih', 'iouh'],
    'age': [12, 23, 40, 8],
}

#### Extract info from dictionary

In [53]:
dico['surname']

['jp', 'pa', 'ma', 'fo']

In [54]:
dico['name']

['aba', 'chd', 'oih', 'iouh']

In [55]:
dico.items()

dict_items([('surname', ['jp', 'pa', 'ma', 'fo']), ('name', ['aba', 'chd', 'oih', 'iouh']), ('age', [12, 23, 40, 8])])

In [56]:
dico.keys()

dict_keys(['surname', 'name', 'age'])

In [57]:
dico.values()

dict_values([['jp', 'pa', 'ma', 'fo'], ['aba', 'chd', 'oih', 'iouh'], [12, 23, 40, 8]])

In [58]:
for key in dico:
    print(dico[key][1])

for key in dico.keys():
    print(dico[key][1])

pa
chd
23
pa
chd
23


In [59]:
for val in dico['age']:
    print(val)

12
23
40
8


#### Add info to dictionary

In [60]:
# Add value to existing key
dico['age'].append(43)

In [61]:
dico

{'surname': ['jp', 'pa', 'ma', 'fo'],
 'name': ['aba', 'chd', 'oih', 'iouh'],
 'age': [12, 23, 40, 8, 43]}

We observe that the lists in the dictionary do not need to be of the same length.

Actually, any object can be stored as a value.

In [62]:
dico['country'] = 'a'

In [63]:
dico

{'surname': ['jp', 'pa', 'ma', 'fo'],
 'name': ['aba', 'chd', 'oih', 'iouh'],
 'age': [12, 23, 40, 8, 43],
 'country': 'a'}

In [64]:
# Replace existing key/values
dico['country'] = ['fr', 'gb', 'us', 'es']

In [65]:
dico

{'surname': ['jp', 'pa', 'ma', 'fo'],
 'name': ['aba', 'chd', 'oih', 'iouh'],
 'age': [12, 23, 40, 8, 43],
 'country': ['fr', 'gb', 'us', 'es']}

In [66]:
dico['objects'] = [truc.truc]

In [67]:
dico

{'surname': ['jp', 'pa', 'ma', 'fo'],
 'name': ['aba', 'chd', 'oih', 'iouh'],
 'age': [12, 23, 40, 8, 43],
 'country': ['fr', 'gb', 'us', 'es'],
 'objects': [<function truc.truc()>]}

In [68]:
# Chaining operations
print(dico['objects'])  # the function is wrapped in a list (square brackets)
print(dico['objects'][0])  # extract the first (and only) element of the list
print(dico['objects'][0]())  # call the extracted function and print its return value

[<function truc at 0x15c731670>]
<function truc at 0x15c731670>
truc


In [69]:
truc.truc

<function truc.truc()>

## Operators

Special ("dunder") methods:
- Greater than \_\_gt\_\_
- Lower than \_\_lt\_\_ 

In [138]:
print(1 > 2)
print(1 < 2)
print(1 < 1)
print('a' < 'b')
print('a' > 'b')
print([1] < [2, 3])
print([1] > [2, 3])

False
True
False
True
False
True
False


Special ("dunder") methods:
- Greater than or equal to \_\_ge\_\_
- Lower than or equal to \_\_le\_\_ 

In [106]:
print(1 >= 2)
print(1 <= 2)
print(1 <= 1)

False
True
True


Special ("dunder") methods:
- Equal to \_\_eq\_\_
- Different from \_\_ne\_\_ 

In [107]:
print(1 == 2)
print(1 != 2)
print(1 == 1)
print(1 != 1)

False
True
True
False


Special ("dunder") method:
- Find element in object \_\_contains\_\_

Available in iterables, not in numbers.

In [128]:
print(1 in [1, 4, 2])
print(1 in [4, 2])
print([1] in [1, 4, 2])
print([1] in [[1], 4, 2])
print('a' in 'oisj')
print('a' in 'oiasj')

True
False
False
True
False
True


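Not a special method (identity operator `is`):
- Check if element is the very same object (True, False, None); for np.nan prefer np.isnan / pd.isna, since a NaN value is not guaranteed to be the `np.nan` object itself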

In [157]:
# `is` tests identity (the very same object), not equality.
print(True is True)
print(True is False)

# `is None` is the idiomatic way to test for None.
print([1] is None)
x = 12
print(x is None)
x = None
print(x is None)

# numpy is already imported as `np` at the top of the notebook, so it is
# not imported again here.
# Caution: `is np.nan` only tests identity with that one float object;
# use np.isnan(...) or pd.isna(...) to detect NaN values in data.
print(x is np.nan)

True
False
False
False
True
False


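Negation operators:
- not (logical negation)
- ~ (bitwise inversion: on plain Python numbers `~1` is `-2`; it acts as an element-wise "not" only on NumPy / pandas boolean arrays)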

In [None]:
print(1 == 1)
print(not 1 == 1)  # `not` negates the comparison result: not (1 == 1)
print(~ 1 == 1)  # parsed as (~1) == 1: ~ is bitwise inversion, ~1 is -2, so this compares -2 == 1
print(1 != 1)
print(not 1 != 1)  # not (1 != 1)
print(~ 1 != 1)  # parsed as (~1) != 1, i.e. -2 != 1; agreeing with `not` here is a coincidence

True
False
False
False
True
True


View an object's internal methods

In [130]:
dir(3)
# dir([1, 2, 3])

['__abs__',
 '__add__',
 '__and__',
 '__bool__',
 '__ceil__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floor__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__le__',
 '__lshift__',
 '__lt__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rlshift__',
 '__rmod__',
 '__rmul__',
 '__ror__',
 '__round__',
 '__rpow__',
 '__rrshift__',
 '__rshift__',
 '__rsub__',
 '__rtruediv__',
 '__rxor__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__trunc__',
 '__xor__',
 'as_integer_ratio',
 'bit_length',
 'conjugate',
 'denominator',
 'from_bytes',
 'imag',
 'numerator',
 'real',
 'to_bytes']

All these operators return boolean values (True / False). In Pandas, this will help us build "masks" (filters)

## Conditions

Simple "if, else" condition

In [147]:
x = 3

if x == 3:
    print("Yes, x is equal to 3")
else:
    print("No, x is not equal to 3")

Yes, x is equal to 3


If is not None

In [148]:
# None is a singleton: test for it with `is not`, not with `!=`
# (`!=` can be overridden by a class's __ne__ / __eq__ methods).
if x is not None:
    print(x)
else:
    print("Not possible")

3


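Shorter way to write it, using truthiness (careful: not strictly equivalent to "is not None", since 0, "" and empty containers are also treated as False)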

In [149]:
# Truthiness test: the else branch runs for None, but also for 0, "", []
# and any other falsy value, so this is not the same as `x is not None`.
if x:
    print(x)
else:
    print("Not possible")

3


## Control Structures

for loop

In [158]:
for i in [2, 54, 39]:
    print(i)

2
54
39


## Functions

Functions can:
- take zero, one, or many arguments
- arguments can be "Positional" or "Keyword" arguments
- return nothing (None) or anything (including several values at once)
- synonyms: arguments / parameters / inputs

One must:
- Define the function
- Call the function

Simplest form:
- No argument required
- No return

In [163]:
# Define the function
def say_something():
    """Print the fixed message "Something"; takes no argument, returns nothing."""
    message = "Something"
    print(message)

# Call the function
say_something()

Something


Function with:
- an argument
- no return

In [159]:
def say_my_name(name):
    """Print ``name`` on its own line.

    Args:
        name: The value to print (converted with ``str``).
    """
    text = str(name)
    print(text)

say_my_name("Alexis")

Alexis


Function with:
- an argument
- a return

In [166]:
def square(x):
    """Return ``x`` raised to the power of 2.

    Args:
        x: Any value supporting the ``**`` operator.

    Returns:
        The result of ``x ** 2``.
    """
    squared = x ** 2
    return squared

result = square(4)
result

16

Function with:
- multiple arguments
- a return

In [167]:
def add(a, b, c):
    """Return the sum of the three arguments, added left to right.

    Args:
        a: First operand.
        b: Second operand.
        c: Third operand.

    Returns:
        The result of ``a + b + c``.
    """
    partial_sum = a + b
    return partial_sum + c

result = add(4, 2, 9)
result

15

Function with:
- no argument
- multiple returns

In [168]:
def return_many_things():
    """Return three strings at once, packed in a tuple.

    Returns:
        The tuple ``('alexis', 'bogroff', 'data')``.
    """
    things = ('alexis', 'bogroff', 'data')
    return things

return_many_things()

('alexis', 'bogroff', 'data')

In [169]:
# Get the result in separate variables
r_1, r_2, r_3 = return_many_things()
print(r_1)
print(r_2)
print(r_3)

alexis
bogroff
data


Function with:
- a keyword argument with a default value
    - is thus optional
    - must be positioned after the positional arguments
- no return

In [171]:
def say_what_you_doing(name, course='data'):
    """Print what ``name`` is doing.

    Args:
        name: Who is doing something.
        course: What is being done; defaults to ``'data'`` when omitted.
    """
    sentence = f"{name} doing {course}"
    print(sentence)

say_what_you_doing("Alexis")
say_what_you_doing("Alexis", "writing the course")

Alexis doing data
Alexis doing writing the course


Function with:
- a (positional) argument awaiting a function
- no return

In [176]:
def complex_fct(func):
    """Print an announcement, then call ``func`` with no arguments.

    Args:
        func: A callable accepting zero arguments; its return value is ignored.
    """
    announcement = "This function will say"
    print(announcement)
    func()

complex_fct(say_something)

This function will say
Something


## Pandas

### Import csv (data)

In [70]:
# Path relative to the current working directory.
path_relative = "examples/TD6_Exam_MCQ.csv"  # relative
# Absolute path derived from the relative one, so the notebook does not
# embed a user-specific home directory and runs unchanged on any machine.
path_absolute = os.path.abspath(path_relative)  # absolute
# Load the CSV into a DataFrame and display its first rows.
df = pd.read_csv(path_relative)
df.head()

Unnamed: 0,Question,Answer1,True?,2,True?.1,3,True?.2,4,True?.3,5,...,6,True?.5,7,True?.6,8,True?.7,9,True?.8,10,True?.9
0,,,,,,,,,,,...,,,,,,,,,,
1,Which Operating System has the biggest Persona...,Microsoft Windows,X,Apple macOS,,Unix - Linux,,,,,...,,,,,,,,,,
2,Which Operating System has the biggest Smartph...,Microsoft Windows,,Apple macOS,,Unix - Linux,X,,,,...,,,,,,,,,,
3,"For Unix creators, human efficiency is more va...",TRUE,X,FALSE,,,,,,,...,,,,,,,,,,
4,"Before Personal Computer era, human efficiency...",TRUE,,FALSE,X,,,,,,...,,,,,,,,,,


In [71]:
type(df)

pandas.core.frame.DataFrame

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Question  48 non-null     object 
 1   Answer1   48 non-null     object 
 2   True?     17 non-null     object 
 3   2         48 non-null     object 
 4   True?.1   17 non-null     object 
 5   3         43 non-null     object 
 6   True?.2   13 non-null     object 
 7   4         39 non-null     object 
 8   True?.3   12 non-null     object 
 9   5         32 non-null     object 
 10  True?.4   6 non-null      object 
 11  6         19 non-null     object 
 12  True?.5   2 non-null      object 
 13  7         11 non-null     object 
 14  True?.6   1 non-null      object 
 15  8         1 non-null      object 
 16  True?.7   0 non-null      float64
 17  9         0 non-null      float64
 18  True?.8   0 non-null      float64
 19  10        0 non-null      float64
 20  True?.9   0 non-null      float64


### Filter (extract) data from DataFrame (Pandas)

In [73]:
# Get line 2
print(type(df.iloc[1]))
df.iloc[1]

<class 'pandas.core.series.Series'>


Question    Which Operating System has the biggest Persona...
Answer1                                     Microsoft Windows
True?                                                       X
2                                                 Apple macOS
True?.1                                                   NaN
3                                                Unix - Linux
True?.2                                                   NaN
4                                                         NaN
True?.3                                                   NaN
5                                                         NaN
True?.4                                                   NaN
6                                                         NaN
True?.5                                                   NaN
7                                                         NaN
True?.6                                                   NaN
8                                                         NaN
True?.7 

In [74]:
# Get line 2 without NAs
df.iloc[1][~df.iloc[1].isna()]

Question    Which Operating System has the biggest Persona...
Answer1                                     Microsoft Windows
True?                                                       X
2                                                 Apple macOS
3                                                Unix - Linux
Name: 1, dtype: object

In [75]:
# Do the same in multiple steps (more comprehensible)
line_2 = df.iloc[1]  # extract => get a pandas Series
mask_isna = line_2.isna()  # mask => True & False
line_2_no_na = line_2[~mask_isna]  # ~ => Get all BUT what is True

In [76]:
line_2_no_na

Question    Which Operating System has the biggest Persona...
Answer1                                     Microsoft Windows
True?                                                       X
2                                                 Apple macOS
3                                                Unix - Linux
Name: 1, dtype: object

## Lists comprehension

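Efficient looping (faster than an explicit `for` loop with `.append`, because the list is built by dedicated interpreter instructions implemented in C)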

In [77]:
l = []
for i in range(50000000):
    l.append(i**2)

In [78]:
# As long as the instruction is simple enough
# Do the following which is faster
l = [i**2 for i in range(50000000)]

In [79]:
# Build a list with one element per i, chosen by a multi-branch rule.
# The branches must be mutually exclusive (if / elif / else): with two
# separate `if` statements, values 11..19 were appended a second time by
# the `else` of the second test.
l = []
for i in range(int(10e6)):
    if 10 < i < 20:
        l.append(i)
    elif 20 < i < 50:
        l.append(0)
    else:
        l.append(i)

## Lambda functions

In [80]:
def mult_11(x):
    """Return ``x`` multiplied by 11.

    Args:
        x: Any value supporting ``*`` with an integer.

    Returns:
        The result of ``x * 11``.
    """
    product = x * 11
    return product

In [81]:
mult_11(4)

44

In [82]:
# Create a function in one line with a lambda expression.
# PEP 8 advises against binding a lambda to a name like this (use `def`
# for named functions); it is done here only to show the syntax.
mult_12 = lambda x: x * 12

In [83]:
mult_12(4)

48

When is it useful? When you have one-time use functions

In [84]:
df = pd.DataFrame({'letters':['a', 'b', 'c']})

In [85]:
df.apply(lambda x: x.str.upper())

Unnamed: 0,letters
0,A
1,B
2,C


In [86]:
# But useless here since we have a simpler solution
df['letters'].str.upper()

0    A
1    B
2    C
Name: letters, dtype: object

In [87]:
df['letters'].apply(lambda x: x.upper() if x=='a' else x)

0    A
1    b
2    c
Name: letters, dtype: object

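Vectorised loop:
- way more efficient on large data
- good practice
- note: `.apply` still calls the function once per element; truly vectorised built-ins (e.g. the `.str` methods) are faster when they exist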

In [88]:
# Use .apply with a pre defined function
df['letters'].apply(mult_12)

# Functions should take only 1 argument as input.

0    aaaaaaaaaaaa
1    bbbbbbbbbbbb
2    cccccccccccc
Name: letters, dtype: object

## Objects

### Create simple class

In [89]:
class Truc:
    """Minimal example class: every instance gets the same two attributes."""

    # Initializer (__init__), run automatically when an instance is created
    def __init__(self):
        """Set ``age`` to 10 and ``name`` to 'truc' on the new instance."""
        self.age, self.name = 10, 'truc'


In [90]:
truc_1 = Truc()

In [91]:
truc_1

<__main__.Truc at 0x15c93d700>

In [92]:
print(truc_1.name)
print(truc_1.age)


truc
10


### Create first method

In [93]:
class Truc:
    """Example class with two attributes and a method that prints them."""

    # Initializer (__init__), run automatically when an instance is created
    def __init__(self):
        """Set ``age`` to 10 and ``name`` to 'truc' on the new instance."""
        self.age, self.name = 10, 'truc'

    def present(self):
        """Print a one-line introduction built from ``name`` and ``age``."""
        introduction = f"My name is: {self.name}, I'm {self.age} years old"
        print(introduction)

In [94]:
truc_1 = Truc()

In [95]:
truc_1

<__main__.Truc at 0x15c93d070>

In [96]:
truc_1.name

'truc'

In [97]:
truc_1.present()

My name is: truc, I'm 10 years old


### Create a method with a return

In [98]:
class Truc:
    """Example class with two attributes, a printing method and a returning method."""

    # Initializer (__init__), run automatically when an instance is created
    def __init__(self):
        """Set ``age`` to 10 and ``name`` to 'truc' on the new instance."""
        self.age, self.name = 10, 'truc'

    def present(self):
        """Print a one-line introduction built from ``name`` and ``age``."""
        introduction = f"My name is: {self.name}, I'm {self.age} years old"
        print(introduction)

    def dog_age(self):
        """Return the instance's ``age`` multiplied by 7."""
        multiplier = 7
        return self.age * multiplier

In [99]:
truc_1 = Truc()

In [100]:
dog_age = truc_1.dog_age()

In [101]:
dog_age

70

#### Chaining operations

In [102]:
truc_1.name.upper()

'TRUC'

In [103]:
name = truc_1.name
name.upper()

'TRUC'