In [8]:
import numpy as np
import pandas as pd

from testing_data import *
import data_types as types
import df_mappings as map
import text_exploration as texplo

# 1. Data Types

In [None]:
# Print out the names of all the functions in the data_types module
print([item for item in dir(types) if not item.startswith('__')])

['col_types', 'nan2nan', 'pd']


### 1.1 Explore types with `col_types()`
`types.col_types(df)` prints out the number of element types found in each column of a given pandas Data Frame:

In [None]:
# help(types.col_types)

Help on function col_types in module data_types:

col_types(df)
    '
    In: a data frame df
    Out: prints the number of element types found in each column of a given df



In [42]:
print(df_multitype,'\n')

types.col_types(df_multitype)

   Column1 Column2 Column3
0      1.0       A    True
1      2.5       B   False
2      3.0       C     NaN
3      4.8       D       1
4      5.0       5   Mixed 

Column1
<class 'float'>    5
Name: count, dtype: int64 
 NaN values:  0 

Column2
<class 'str'>    4
<class 'int'>    1
Name: count, dtype: int64 
 NaN values:  0 

Column3
<class 'bool'>     2
<class 'float'>    1
<class 'int'>      1
<class 'str'>      1
Name: count, dtype: int64 
 NaN values:  1 



### 1.2 Explore Same-Row NaNs with `nan2nan`
`types.nan2nan` serves to answer the question: Are NaN values in col_1 also NaN in col 2?

In [None]:
# help(types.nan2nan)

Help on function nan2nan in module data_types:

nan2nan(df, col_1_name, col_2_name)
    Given 2 columns in a data frame df, this function
    returns a list with the number of NaNs in the first,
    The number of NaNs in the second,
    And the number fo rows where both cols have NaN value



In [41]:
print(df_nans,'\n')

types.nan2nan(df_nans, 'Column1', 'Column2')

   Column1 Column2 Column3
0      1.0       A    True
1      NaN       B     NaN
2      NaN     NaN     NaN
3      4.0     NaN    True
4      5.0       E   False 

NaN values in Column1:  2 

NaN values in Column2:  2 

NaN values in the same row for both Column1 and Column2:  1


# 2. Mappings

In [None]:
# Print out the names of all the functions in the df_mappings module
print([item for item in dir(map) if not item.startswith('__')])

['foreign_k2k', 'map_col2col', 'occurrences', 'pd', 'xs2xs', 'xs2ys']


### 2.1 Which values are in the same position?
`maps.map_col2col`: given 2 columns in a data frame (or any two same length columns), one might be interested in how they map onto each other. Is it a 1 to 1 mapping? Does it define a function? And if so, is the function injective of surjective?

The following function, given col_1 and col_2, returns, for each value in col_1, how many values it is associated to in col_2:

In [None]:
# help(map.map_col2col)

Help on function map_col2col in module df_mappings:

map_col2col(series_1, series_2)



In [None]:
# Try it out here:
print(df_maps1, '\n')
print(map_col2col(df_maps1.b, df_maps1.c))

   a  b      c
0  1  1  lollo
1  2  3  gigio
2  3  1  lollo
3  4  3  gigio
4  3  1  lollo 

   gigio  lollo
1  False   True
3   True  False


By calling `map_count(col_1, col_2)` and then `map_count(col_2, col_1)`, one will be able to determine the relation between the two columns.

### 2.2 Can 2 same type series get a 1 to 1 mapping?
`map.foreign_k2k`: are you wandering whether two columns from different tables have the same unique values?

In [None]:
# help(map.foreign_k2k)

Help on function foreign_k2k in module df_mappings:

foreign_k2k(series_1, series_2, df1_name='table1', df2_name='table2', n_matches=-1, mode='>')



In [9]:

map.foreign_k2k(df_maps2.a, df_maps2.b)

Unnamed: 0,table1.a_values,matches_in_table2.b > -1
0,1,2
1,2,0
2,3,1
3,4,0


### 2.3 xs2xs

In [69]:
iter_1 = [1, 2, 3, 4, 5, 6, np.nan, np.nan, 3, 3]
iter_2 = [1, 5, 3, 3, 4, 5, 8, np.nan]

i1 = pd.Series(iter_1)
i2 = pd.Series(iter_2)

In [None]:
i1.isna().sum()

np.int64(4)

In [17]:
pd.isna(i1[6])

True

In [None]:
r = pd.DataFrame()
# Assigning names to the columns
values = 'All values'
name_1 = 'iter_1'
name_2 = 'iter_2'
count_1 = f'occurrences in {name_1}'
count_2 = f'occurrences in {name_2}'
# Create a DataFrame with unique values from both iterators
r[values] = pd.concat([i1, i2]).unique()

# Count occurrences of each unique value in the first iterator, managing NaN values
r[count_1] = r[values].map(lambda x: i1.value_counts()[x] if x in i1.value_counts() else i1.isna().sum() if pd.isna(x) else 0)

# Count occurrences of each unique value in the second iterator, managing NaN values
r[count_2] = r[values].map(lambda x: i2.value_counts()[x] if x in i2.value_counts() else i2.isna().sum() if pd.isna(x) else 0)


In [54]:
x = r[values][0]
i1.value_counts()[x]

np.int64(1)

In [71]:
print(r)

   All values  occurrences in iter_1  occurrences in iter_2
0         1.0                      1                      1
1         2.0                      1                      0
2         3.0                      3                      2
3         4.0                      1                      1
4         5.0                      1                      2
5         6.0                      1                      0
6         NaN                      2                      1
7         8.0                      0                      1


In [16]:
def xs2xs(iter_1, iter_2, name_1 = 'iter_1', name_2 = 'iter_2'):
    '''
    This function is intended to check whether two iterables have the same values, i.e. whether a 1 to 1 mapping is possible.
    Provided iter_1 and iter_2, xs2xs checks whether there are values in one of them that are absent in the other.
    It returns a data frame where the first field lists all values in either iterable; the second field indicates how many occurrences
    of the element are present in iter_1; the third field ndicates how many occurrences of the element are present in iter_2. The
    data frame is ordered as to show first the elements present in only one of the two iterables.
    '''
    r = pd.DataFrame()
    i1 = pd.Series(iter_1)
    i2 = pd.Series(iter_2)

    # Assigning names to the columns
    values = 'All values'
    count_1 = f'occurrences in {name_1}'
    count_2 = f'occurrences in {name_2}'
    # Create a DataFrame with unique values from both iterators
    r[values] = pd.concat([i1, i2]).unique()
    
    # Count occurrences of each unique value in the first iterator (if-else bc NaNs would be counted NaNs by value_counts)
    r[count_1] = r[values].map(lambda x: i1.isna().sum() if pd.isna(x) else i1.value_counts())
    
    # Count occurrences of each unique value in the second iterator
    r[count_2] = r[values].map(i2.value_counts())

    # NaN values counts actually indicate 0 occurrences 
    # r[count_1] = r[count_1].fillna(0)
    # r[count_2] = r[count_2].fillna(0)

    # Show unmatched values first
    r = r.sort_values(by=[count_1, count_2], ascending=[True, True])

    # Print stuff that is good to know:
    # 1. The number of NaNs in each iterator
    nans_1 = i1.isna().sum()
    nans_1_relative = nans_1 / len(i1) * 100
    nans_2 = i2.isna().sum()
    nans_2_relative = nans_2 / len(i2) * 100
    # 2. The percentage of 1 to 1 mapping
    unmatched = ((r[count_1] == 0) | (r[count_2] == 0)).sum()
    unmatched_relative = unmatched / len(r) * 100
    bijective_mapping_relative = 100 - unmatched_relative

    print(f'Number of NaNs in {name_1}: {nans_1} ({nans_1_relative:.2f}%)')
    print(f'Number of NaNs in {name_2}: {nans_2} ({nans_2_relative:.2f}%)')
    print(f'Percentage of unmatched values: {unmatched_relative:.2f}%')
    print(f'Percentage of bijective mapping: {bijective_mapping_relative:.2f}%')

    return r

In [17]:
print(xs2xs(iter_1, iter_2))

TypeError: unhashable type: 'Series'

# 3. Explore your text fields

In [None]:
# Print out the names of all the functions in the text_exploration module
print([item for item in dir(texplo) if not item.startswith('__')])

['finals', 'initials', 'pd']


### 3.1 Need trimming?
A long column of strings sometimes starts with undesirable characters like brackets or others. Let's quickly check what are the initials of our strings:

In [35]:
help(texplo.initials)

Help on function initials in module text_exploration:

initials(s)



In [34]:
help(texplo.finals)

Help on function finals in module text_exploration:

finals(s)



In [33]:
print(df_text, '\n')

print(texplo.finals(df_text.Column3))

          Column1 Column2    Column3
0       ##apple##     A1     @hello@
1        banana!!     B2   !!world!!
2      %%cherry%%     C3     python 
3          date@@     D4     rocks@@
4  @@elderberry@@     E5        None 

Column3
@       2
!       1
        1
None    1
Name: count, dtype: int64
