In [None]:
# Report the version of the interpreter this kernel is actually running.
# `!python --version` asks whichever `python` is first on the shell PATH, which
# is not necessarily the kernel's interpreter; platform.python_version() is.
# The printed text keeps the same "Python X.Y.Z" shape as the shell command.
import platform
print('Python', platform.python_version())   # Python version

# About python:  https://www.python.org/
#                Python is powerful... and fast; plays well with others; runs everywhere; is friendly & easy to learn;
#                is Open –> https://www.python.org/about/.
#     Python docs: https://docs.python.org/3/ (all documentation);
#                  https://docs.python.org/3.10/ (Recommended version – 3.10).
# The Python Tutorial (python3.10): https://docs.python.org/3.10/tutorial/index.html

# Load Modules ---
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
# NumPy : The fundamental package for scientific computing in Python.
#         It is a Python library that provides a multidimensional array object, various derived
#         objects (such as masked arrays and matrices), and an assortment of routines for fast operations on arrays,
#         including mathematical, logical, shape manipulation, sorting, selecting, I/O, discrete Fourier transforms,
#         basic linear algebra, basic statistical operations, random simulation and much more.
#     About: https://numpy.org/
#     Docs: https://numpy.org/doc/stable/
#     NumPy quickstart: https://numpy.org/doc/stable/user/quickstart.html

# Pandas: pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,
#         built on top of the Python programming language.
#     About: https://pandas.pydata.org/
#     Docs: https://pandas.pydata.org/docs/
#     Getting started: https://pandas.pydata.org/docs/getting_started/index.html
#     User Guide: https://pandas.pydata.org/docs/user_guide/index.html#user-guide

# Report the versions of the libraries this notebook was run with.
for label, module in (('numpy version:', np), ('pandas version: ', pd)):
    print(label, module.__version__)

Python 3.10.12
numpy version: 1.23.5
pandas version:  1.5.3


In [None]:
# Sample Series mixing ordinary numbers with a missing value (NaN) and infinity.
# No index is passed, so the values get the default integer labels 0..5.
s = pd.Series(data=[-1, 2, 9, 0.4, np.nan, np.inf])
s

0   -1.0
1    2.0
2    9.0
3    0.4
4    NaN
5    inf
dtype: float64

In [None]:
# Basic facts and statistics of the sample Series.
# NaN is skipped by default (skipna=True) but inf is an ordinary value,
# which is why the mean prints as inf and the standard deviation as nan.
for label, value in (
    ('Shape of the data:', s.shape),
    ('Size of the data:', s.size),
    ('Mean of series:', s.mean()),
    ('Standard deviation of series:', s.std()),
    ('Maximum value in series:', s.max()),
    ('Minimum value in series:', s.min()),
):
    print(label, value)

Shape of the data: (6,)
Size of the data: 6
Mean of series: inf
Standard deviation of series: nan
Maximum value in series: inf
Minimum value in series: -1.0


In [None]:
# Same sample values, but labelled with the letters of 'python' instead of 0..5.
s_with_user_define_indexs = pd.Series(
    data=[-1, 2, 9, 0.4, np.nan, np.inf],
    index=['p', 'y', 't', 'h', 'o', 'n'],  # same labels as list('python')
)
s_with_user_define_indexs

p   -1.0
y    2.0
t    9.0
h    0.4
o    NaN
n    inf
dtype: float64

In [None]:
# Build a large Series (1 million values) from a NumPy array of standard-normal samples.
# A locally seeded generator is used so the numbers are identical on every
# Restart & Run All; the unseeded np.random.normal call produced different values each run.
series_from_numpy_array = pd.Series(
    np.random.default_rng(seed=42).normal(size=1_000_000))
series_from_numpy_array.head()  # first five values (head's default)

0   -0.341719
1   -0.588678
2    0.112445
3    1.595582
4    0.265188
dtype: float64

In [None]:
# Peek at the end of the Series: with 1,000,000 values the last labels are 999997..999999.
series_from_numpy_array.tail(3)# show last three values (default n = 5)

999997   -0.205402
999998   -0.169132
999999   -0.250578
dtype: float64

In [None]:
# Basic facts and statistics of the 1-million-value Series (NaN would be skipped: skipna=True).
for label, value in (
    ('Shape of the data:', series_from_numpy_array.shape),
    ('Size of the data:', series_from_numpy_array.size),
    ('Mean of series:', series_from_numpy_array.mean()),
    ('Standard deviation of series:', series_from_numpy_array.std()),
    ('Maximum value in series:', series_from_numpy_array.max()),
    ('Minimum value in series:', series_from_numpy_array.min()),
):
    print(label, value)

Shape of the data: (1000000,)
Size of the data: 1000000
Mean of series: 0.000765787745633034
Standard deviation of series: 0.9995190943989892
Maximum value in series: 5.189385540321411
Minimum value in series: -4.718500766829448


In [None]:
# Third element by position. On a Series with non-integer labels, plain `series[2]`
# only worked through an integer-as-position fallback that newer pandas releases
# deprecate (FutureWarning); .iloc states the positional intent explicitly and
# returns the same value.
s_with_user_define_indexs.iloc[2]

9.0

In [None]:
# Label-based lookup with plain []: 't' is the label of the third value (9.0).
s_with_user_define_indexs['t']

9.0

In [None]:
# The same lookup made explicit: .loc selects by index label.
s_with_user_define_indexs.loc['t']

9.0

In [None]:
# .iloc selects by 0-based integer position, whatever the labels are.
s_with_user_define_indexs.iloc[2]

9.0

In [None]:
# s carries the default labels 0..5, so the label 2 is also the element at position 2.
s.loc[2]

9.0

In [None]:
# Position 2 of s; same element as label 2 because labels and positions coincide here.
s.iloc[2]

9.0

In [None]:
# Integer slice with plain []: elements 2, 3 and 4 (the stop value 5 is excluded).
s[2:5]

2    9.0
3    0.4
4    NaN
dtype: float64

In [None]:
# Slicing by label keeps BOTH endpoints: everything from 't' through 'n'.
s_with_user_define_indexs['t':'n']

t    9.0
h    0.4
o    NaN
n    inf
dtype: float64

In [None]:
# An integer slice on the letter-labelled Series is positional: positions 2..4, stop excluded.
s_with_user_define_indexs[2:5]

t    9.0
h    0.4
o    NaN
dtype: float64

In [None]:
# Plain [] slice again, for comparison with .loc / .iloc slicing: it stops before 5.
s[2:5]

2    9.0
3    0.4
4    NaN
dtype: float64

In [None]:
# .loc slices by label and keeps the stop label, so label 5 is included (four values).
s.loc[2:5]

2    9.0
3    0.4
4    NaN
5    inf
dtype: float64

In [None]:
# .iloc slices by position and excludes the stop, like a Python list (three values).
s.iloc[2:5]

2    9.0
3    0.4
4    NaN
dtype: float64

In [None]:
# Negative step: walk backwards from the last element, taking every second value.
s[::-2]

5    inf
3    0.4
1    2.0
dtype: float64

In [None]:
# start:stop:step slice: elements 3, 5, 7 and 9 (the stop value 11 is excluded).
series_from_numpy_array[3:11:2]

3    1.595582
5    0.633965
7    0.767163
9   -1.460083
dtype: float64

In [None]:
# A list of labels selects those elements in the order listed (not sorted).
series_from_numpy_array[[12,4,7,8]]

12   -0.549245
4     0.265188
7     0.767163
8     1.202908
dtype: float64

In [None]:
# Concise summary: index type, non-null count, dtype and memory usage.
# Only NaN counts as null, so 5 of the 6 entries are reported as non-null (inf included).
s.info()

<class 'pandas.core.series.Series'>
RangeIndex: 6 entries, 0 to 5
Series name: None
Non-Null Count  Dtype  
--------------  -----  
5 non-null      float64
dtypes: float64(1)
memory usage: 176.0 bytes


In [None]:
# Summary statistics in one call: count, mean, std, min, quartiles and max.
series_from_numpy_array.describe()

count    1000000.000000
mean           0.000766
std            0.999519
min           -4.718501
25%           -0.673079
50%           -0.000137
75%            0.674438
max            5.189386
dtype: float64

In [None]:
# Month-end dates between 19 Sep 2000 and 19 Sep 2001.
# The dates are written in ISO format (YYYY-MM-DD): the day-first string '19/9/2000'
# is ambiguous and made pandas emit a date-parsing warning.
# The frequency is passed as an offset object because the lower-case alias 'm'
# is deprecated in newer pandas releases; MonthEnd() means the same thing.
pd.date_range(start='2000-09-19', end='2001-09-19', freq=pd.offsets.MonthEnd())

  pd.date_range(start='19/9/2000',end='19/9/2001',freq='m',)


DatetimeIndex(['2000-09-30', '2000-10-31', '2000-11-30', '2000-12-31',
               '2001-01-31', '2001-02-28', '2001-03-31', '2001-04-30',
               '2001-05-31', '2001-06-30', '2001-07-31', '2001-08-31'],
              dtype='datetime64[ns]', freq='M')

In [None]:
# Ten consecutive month-end dates starting from 19 Sep 2000.
# ISO date string instead of the ambiguous day-first '19/9/2000' (which triggered a
# date-parsing warning), and an explicit MonthEnd() offset instead of the
# lower-case alias 'm', which newer pandas releases deprecate.
pd.date_range(start='2000-09-19', periods=10, freq=pd.offsets.MonthEnd())

  pd.date_range(start='19/9/2000',periods=10,freq='m',)


DatetimeIndex(['2000-09-30', '2000-10-31', '2000-11-30', '2000-12-31',
               '2001-01-31', '2001-02-28', '2001-03-31', '2001-04-30',
               '2001-05-31', '2001-06-30'],
              dtype='datetime64[ns]', freq='M')

In [None]:
# Column-oriented sample data: each key is a column name, each list holds one value per row.
data = {
    'name':          ['c', 'c++', 'java', 'python'],
    'type':          ['compiler', 'compiler', 'compiler', 'interpreter'],
    'level':         ['low', 'low', 'high', 'high'],
    # NOTE: 3.10 is a float literal, so it is stored (and later displayed) as 3.1.
    'version':       [12, 17, 18, 3.10],
    'user-friendly': [False, False, False, True],
    'installed':     [None, None, True, False],  # None marks "unknown"
    'last-updated':  None,                       # a single scalar, not a list
}

In [None]:
# Build a DataFrame from the dict: each key becomes a column.
# The scalar None given for 'last-updated' is repeated on every row (an all-empty column).
df=pd.DataFrame(data=data)
df

Unnamed: 0,name,type,level,version,user-friendly,installed,last-updated
0,c,compiler,low,12.0,False,,
1,c++,compiler,low,17.0,False,,
2,java,compiler,high,18.0,False,True,
3,python,interpreter,high,3.1,True,False,


In [None]:
# Column labels, in the same order as the keys of the source dict.
df.columns

Index(['name', 'type', 'level', 'version', 'user-friendly', 'installed',
       'last-updated'],
      dtype='object')

In [None]:
# Row labels: no index was supplied, so this is the default RangeIndex 0..3.
df.index

RangeIndex(start=0, stop=4, step=1)

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           4 non-null      object 
 1   type           4 non-null      object 
 2   level          4 non-null      object 
 3   version        4 non-null      float64
 4   user-friendly  4 non-null      bool   
 5   installed      2 non-null      object 
 6   last-updated   0 non-null      object 
dtypes: bool(1), float64(1), object(5)
memory usage: 324.0+ bytes


In [None]:
# Without arguments describe() summarises only the numeric columns (here just 'version').
df.describe()

Unnamed: 0,version
count,4.0
mean,12.525
std,6.809491
min,3.1
25%,9.775
50%,14.5
75%,17.25
max,18.0


In [None]:
# Summarise the object-dtype columns instead: count, unique, top (most frequent value) and freq.
df.describe(include=object) #df.describe(include='object')

Unnamed: 0,name,type,level,installed,last-updated
count,4,4,4,2,0.0
unique,4,2,2,2,0.0
top,c,compiler,low,True,
freq,1,3,2,1,


In [None]:
# The same kind of summary restricted to boolean columns (here just 'user-friendly').
df.describe(include='bool') #df.describe(include=bool)

Unnamed: 0,user-friendly
count,4
unique,2
top,False
freq,3


In [None]:
# Selecting one column with [] returns a Series named after that column.
df['user-friendly']         # get single column (by its name)

0    False
1    False
2    False
3     True
Name: user-friendly, dtype: bool

In [None]:
# Attribute-style column access; gives the same Series as df['name'].
# It only works when the column name is a valid Python identifier, so a column
# such as 'user-friendly' has to be selected with [].
df.name #df['name']

0         c
1       c++
2      java
3    python
Name: name, dtype: object

In [None]:
# A list of column names returns a DataFrame with those columns in the requested order.
df[['user-friendly','name','installed']]

Unnamed: 0,user-friendly,name,installed
0,False,c,
1,False,c++,
2,False,java,True
3,True,python,False


In [None]:
# One row selected by its index label, returned as a Series indexed by the column names.
df.loc[3]

name                  python
type             interpreter
level                   high
version                  3.1
user-friendly           True
installed              False
last-updated            None
Name: 3, dtype: object

In [None]:
# Label slice of rows: both endpoints (1 and 3) are included.
df.loc[1:3]

Unnamed: 0,name,type,level,version,user-friendly,installed,last-updated
1,c++,compiler,low,17.0,False,,
2,java,compiler,high,18.0,False,True,
3,python,interpreter,high,3.1,True,False,


In [None]:
# A list of row labels returns those rows in the order listed.
df.loc[[2,3,0]]

Unnamed: 0,name,type,level,version,user-friendly,installed,last-updated
2,java,compiler,high,18.0,False,True,
3,python,interpreter,high,3.1,True,False,
0,c,compiler,low,12.0,False,,


In [None]:
# Single cell addressed by row label and column label.
df.loc[3,'installed']

False

In [None]:
# The same cell addressed by position: row 3, column 5 ('installed' is the sixth column).
df.iloc[3,5]

False

In [None]:
# All rows (:) of one column, selected by label.
df.loc[:,'installed']

0     None
1     None
2     True
3    False
Name: installed, dtype: object

In [None]:
# All rows of the column at position 5, i.e. 'installed' again.
df.iloc[:,5]

0     None
1     None
2     True
3    False
Name: installed, dtype: object

In [None]:
# Label slices on both axes; each slice includes both of its endpoints.
df.loc[1:2,'name':'version']

Unnamed: 0,name,type,level,version
1,c++,compiler,low,17.0
2,java,compiler,high,18.0


In [None]:
# Positional equivalent: stops are excluded, so 1:3 gives rows 1-2 and 0:4 the first four columns.
df.iloc[1:3,0:4]

Unnamed: 0,name,type,level,version
1,c++,compiler,low,17.0
2,java,compiler,high,18.0


In [None]:
# Row label slice combined with a list of column labels (columns come back in the listed order).
df.loc[1:2,['version','level']]

Unnamed: 0,version,level
1,17.0,low
2,18.0,high


In [None]:
# Positional equivalent: column positions 3 and 2 are 'version' and 'level'.
df.iloc[1:3,[3,2]]

Unnamed: 0,version,level
1,17.0,low
2,18.0,high


In [None]:
# Lists on both axes: rows and columns are returned in exactly the order requested.
df.loc[[3,1,2],['version','level','name']]

Unnamed: 0,version,level,name
3,3.1,high,python
1,17.0,low,c++
2,18.0,high,java


In [None]:
# Positional equivalent: column positions 3, 2, 0 are 'version', 'level', 'name'.
df.iloc[[3,1,2],[3,2,0]]

Unnamed: 0,version,level,name
3,3.1,high,python
1,17.0,low,c++
2,18.0,high,java


In [None]:
# A 10,000 x 5 table of standard-normal samples with columns 'a'..'e'.
# The samples come from a locally seeded generator so the frame is identical on
# every Restart & Run All; the unseeded np.random.normal call changed it each run.
big_array = np.random.default_rng(seed=42).normal(size=(10_000, 5))
df_big = pd.DataFrame(big_array, columns=list('abcde'))
df_big.head()

Unnamed: 0,a,b,c,d,e
0,0.674623,0.922537,-0.564516,1.224484,-0.80399
1,-0.114957,1.151816,0.067731,0.382998,0.680371
2,0.631263,0.278184,-1.760344,-0.102186,0.270739
3,2.221748,-0.215852,0.266361,-0.114636,0.035511
4,3.752407,-1.810288,0.943687,0.487681,-1.819993


In [None]:
# Last five rows (tail's default), labels 9995..9999.
df_big.tail()

Unnamed: 0,a,b,c,d,e
9995,0.342008,0.144935,-1.967588,-0.808602,0.188763
9996,-1.754219,-0.467132,1.229573,0.475106,0.82703
9997,0.668059,-1.366031,-0.28957,-0.247627,-1.738946
9998,-1.487397,-0.344092,-1.01717,-0.873647,-0.343311
9999,0.267993,0.947058,-1.246768,-0.969264,1.52862


In [None]:
# Column labels of the big frame.
df_big.columns

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [None]:
# Row labels: the default RangeIndex 0..9999.
df_big.index

RangeIndex(start=0, stop=10000, step=1)

In [None]:
# (number of rows, number of columns)
df_big.shape

(10000, 5)

In [None]:
# Total number of cells: rows x columns.
df_big.size

50000

In [None]:
# Draw one column label at random (unseeded, so the result differs from run to run).
np.random.choice(df_big.columns)

'c'

In [None]:
# Draw one row label at random (unseeded, so the result differs from run to run).
np.random.choice(df_big.index)

7047

In [None]:
%timeit df_big.at[np.random.choice(df_big.index),np.random.choice(df_big.columns)]

64.7 µs ± 8.99 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [None]:
%timeit df_big.loc[np.random.choice(df_big.index),np.random.choice(df_big.columns)]

102 µs ± 9.4 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [None]:
# axis=0 aggregates down the rows, leaving one value per column.
df_big.mean(axis=0)# mean of each column (feature-wise)

a    0.000308
b    0.017318
c    0.007060
d   -0.014849
e   -0.001609
dtype: float64

In [None]:
# axis=1 aggregates across the columns, leaving one value per row (10,000 values).
df_big.mean(axis=1)# mean of each row (sample-wise)

0       0.290628
1       0.433592
2      -0.136469
3       0.438626
4       0.310699
          ...   
9995   -0.420097
9996    0.062071
9997   -0.594823
9998   -0.813123
9999    0.105528
Length: 10000, dtype: float64

In [None]:
# A single column is a Series, so .mean() returns one scalar.
df_big['a'].mean() # mean of column 'a'

0.00030842632656348225

In [None]:
df_big.loc[3].mean()# mean of the row labelled 3 only (the fourth row, since labels start at 0)

0.4386263702064238

In [None]:
# Add a column holding a running counter 0..n-1, one value per row.
# The length is taken from the frame itself instead of repeating the literal 10_000,
# so the assignment stays valid if df_big is ever built with a different row count.
df_big['new-column'] = range(len(df_big))

In [None]:
# Check the new column from the top: it holds 0..4 on the first five rows.
df_big.head()

Unnamed: 0,a,b,c,d,e,new-column
0,0.674623,0.922537,-0.564516,1.224484,-0.80399,0
1,-0.114957,1.151816,0.067731,0.382998,0.680371,1
2,0.631263,0.278184,-1.760344,-0.102186,0.270739,2
3,2.221748,-0.215852,0.266361,-0.114636,0.035511,3
4,3.752407,-1.810288,0.943687,0.487681,-1.819993,4


In [None]:
# Check the new column from the bottom: it holds 9995..9999 on the last five rows.
df_big.tail()

Unnamed: 0,a,b,c,d,e,new-column
9995,0.342008,0.144935,-1.967588,-0.808602,0.188763,9995
9996,-1.754219,-0.467132,1.229573,0.475106,0.82703,9996
9997,0.668059,-1.366031,-0.28957,-0.247627,-1.738946,9997
9998,-1.487397,-0.344092,-1.01717,-0.873647,-0.343311,9998
9999,0.267993,0.947058,-1.246768,-0.969264,1.52862,9999


In [None]:
# Overwrite 'new-column' with hourly timestamps starting at 2000-09-19 00:00, one per row.
# periods follows the frame's length instead of repeating the literal 10_000, and the
# frequency is an explicit offset object because the 'H' alias is deprecated in newer
# pandas releases; Hour() means the same thing.
df_big['new-column'] = pd.date_range(start='2000/9/19',
                                     periods=len(df_big), freq=pd.offsets.Hour())

In [None]:
# 'new-column' now holds hourly timestamps instead of the counter.
df_big.head()

Unnamed: 0,a,b,c,d,e,new-column
0,0.674623,0.922537,-0.564516,1.224484,-0.80399,2000-09-19 00:00:00
1,-0.114957,1.151816,0.067731,0.382998,0.680371,2000-09-19 01:00:00
2,0.631263,0.278184,-1.760344,-0.102186,0.270739,2000-09-19 02:00:00
3,2.221748,-0.215852,0.266361,-0.114636,0.035511,2000-09-19 03:00:00
4,3.752407,-1.810288,0.943687,0.487681,-1.819993,2000-09-19 04:00:00


In [None]:
# Time sorting the values of a randomly chosen column.
# NOTE(review): the column is drawn inside the timed statement, so each loop also pays for
# np.random.choice and may sort a different column (including 'new-column') — confirm intended.
%timeit df_big[np.random.choice(df_big.columns)].sort_values()

1.04 ms ± 95.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
# Convert the DataFrame to a NumPy array and report what came back.
numpy_array = df_big.to_numpy()
for label, value in (
    ('Type of object:', type(numpy_array)),
    ('Shape of object:', numpy_array.shape),
    ('Size of object:', numpy_array.size),
):
    print(label, value)

Type of object: <class 'numpy.ndarray'>
Shape of object: (10000, 6)
Size of object: 60000


In [None]:
# Alternative spelling of the same conversion through the .values attribute
# (pandas' documentation recommends to_numpy() for new code).
numpy_array=df_big.values # convert pandas dataframe to numpy array

In [None]:
# Write df to a CSV file in the working directory; the row index is written too (no index=False).
df.to_csv('dataframe.csv')

In [None]:
# Read the file back without index_col: the saved index returns as an ordinary
# column (the 'Unnamed: 0' column that is dropped in a later cell).
df_from_csv=pd.read_csv('dataframe.csv')
df_from_csv

In [None]:
# Show the frame without the 'Unnamed: 0' column. drop() returns a new DataFrame
# (inplace defaults to False), so df_from_csv itself is left unchanged.
df_from_csv.drop(columns='Unnamed: 0')

In [None]:
# Read the file again, this time using the first column as the row index.
df_from_csv=pd.read_csv('dataframe.csv',index_col=0)
df_from_csv

In [None]:
# index=False leaves the row labels out of the file.
df.to_csv('dataframe-without-indexs.csv',index=False)

In [None]:
# Same writer with a tab as the field separator, producing a TSV file.
df.to_csv('dataframe.tsv',sep='\t')

In [None]:
# Read the TSV back: the separator must match the one used for writing.
# Note that this reuses (overwrites) the name df_from_csv.
df_from_csv=pd.read_csv('dataframe.tsv',sep='\t',index_col=0)
df_from_csv

In [None]:
# Write df to an Excel workbook in the working directory.
# NOTE(review): presumably needs an Excel writer engine (e.g. openpyxl) installed — confirm for the target environment.
df.to_excel('dataframe.xlsx')

In [None]:
# Read the workbook back, using the first column as the row index.
# In the displayed result the 'installed' values True/False come back as 1.0/0.0.
df_from_excel=pd.read_excel('dataframe.xlsx',index_col=0)
df_from_excel

Unnamed: 0,name,type,level,version,user-friendly,installed,last-updated
0,c,compiler,low,12.0,False,,
1,c++,compiler,low,17.0,False,,
2,java,compiler,high,18.0,False,1.0,
3,python,interpreter,high,3.1,True,0.0,


# References

* [The Python Tutorial](https://docs.python.org/3.10/tutorial/index.html)
* [NumPy quickstart](https://numpy.org/doc/stable/user/quickstart.html)
* [Matplotlib's Quick start guide](https://matplotlib.org/stable/users/explain/quick_start.html)
* [Pyplot tutorial](https://matplotlib.org/stable/tutorials/pyplot.html#sphx-glr-tutorials-pyplot-py)
* [Pyplot - image tutorial](https://matplotlib.org/stable/tutorials/images.html#sphx-glr-tutorials-images-py)
* [Matplotlib - All Tutorials](https://matplotlib.org/stable/tutorials/index.html)
* [10 Minutes to Pandas (Pandas, official tutorial)](https://pandas.pydata.org/docs/user_guide/10min.html)