In [2]:
import numpy as np
import pandas as pd

#import text_exploration as texplo

In [8]:
# Small fixture DataFrames used by the demonstrations further down.

# Columns whose elements are deliberately of heterogeneous Python types.
data_multitype = dict(
    Column1=[1, 2.5, 3, 4.8, 5],                # integers and floats together
    Column2=['A', 'B', 'C', 'D', 5],            # strings plus a single integer
    Column3=[True, False, np.nan, 1, 'Mixed'],  # booleans, a NaN, an integer and a string
)
df_multitype = pd.DataFrame(data_multitype)

# Columns with missing values; some rows are missing in several columns at once.
data_nans = dict(
    Column1=[1, np.nan, np.nan, 4, 5],
    Column2=['A', 'B', np.nan, np.nan, 'E'],
    Column3=[True, np.nan, np.nan, True, False],
)
df_nans = pd.DataFrame(data_nans)

# Columns of one table to be mapped onto each other.
df_maps1 = pd.DataFrame(dict(
    a=[1, 2, 3, 4, 3],
    b=[1, 3, 1, 3, 1],
    c=['lollo', 'gigio', 'lollo', 'gigio', 'lollo'],
))

# Two numeric columns whose value sets only partially overlap.
df_maps2 = pd.DataFrame(dict(
    a=[1, 2, 3, 4],
    b=[1, 1, 3, 5],
))

# 1. Data Types

In [27]:
import data_types as types

# List what the data_types module exposes, skipping every name that begins with a double underscore
print([name for name in dir(types) if not name.startswith('__')])

['col_types', 'nan2nan', 'pd']


`types.col_types(df)` prints, for each column of a given pandas DataFrame, how many elements of each Python type it contains, followed by the number of NaN values in that column:

In [None]:
# Show the built-in help (signature and docstring) for col_types.
help(types.col_types)

In [42]:
# Show the mixed-type test frame, followed by an empty line.
print(df_multitype,'\n')

# Print the per-column breakdown of element types and the NaN count.
types.col_types(df_multitype)

   Column1 Column2 Column3
0      1.0       A    True
1      2.5       B   False
2      3.0       C     NaN
3      4.8       D       1
4      5.0       5   Mixed 

Column1
<class 'float'>    5
Name: count, dtype: int64 
 NaN values:  0 

Column2
<class 'str'>    4
<class 'int'>    1
Name: count, dtype: int64 
 NaN values:  0 

Column3
<class 'bool'>     2
<class 'float'>    1
<class 'int'>      1
<class 'str'>      1
Name: count, dtype: int64 
 NaN values:  1 



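`types.nan2nan` answers the question: are the NaN values in col_1 also NaN in col_2? It prints the number of NaN values in each of the two columns and the number of rows in which both are NaN: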

In [41]:
# Show the test frame with missing values, followed by an empty line.
print(df_nans,'\n')

# Compare where Column1 and Column2 are NaN.
types.nan2nan(df_nans, 'Column1', 'Column2')

   Column1 Column2 Column3
0      1.0       A    True
1      NaN       B     NaN
2      NaN     NaN     NaN
3      4.0     NaN    True
4      5.0       E   False 

NaN values in Column1:  2 

NaN values in Column2:  2 

NaN values in the same row for both Column1 and Column2:  1


# 2. Mappings

In [5]:
# NOTE: the alias `map` hides Python's built-in map() in this notebook from this cell onwards.
import df_mappings as map

# List what the module exposes, skipping every name that begins with a double underscore.
print([name for name in dir(map) if not name.startswith('__')])

['foreign_k2k', 'map_col2col', 'occurrences', 'pd']


`map.map_col2col`: given 2 columns in a data frame (or any two columns of the same length), one might be interested in how they map onto each other. Is it a 1 to 1 mapping? Does it define a function? And if so, is the function injective or surjective?

The following function, given col_1 and col_2, returns, for each value in col_1, how many values it is associated to in col_2:

In [None]:
# Try it out here: show the input frame, then how the values of column b map onto the values of column c.
# map_col2col is only reachable through the df_mappings module (imported as `map`); no visible cell
# imports it as a bare name, so calling it unqualified raises NameError on a fresh kernel.
print(df_maps1, '\n')
print(map.map_col2col(df_maps1.b, df_maps1.c))

   a  b      c
0  1  1  lollo
1  2  3  gigio
2  3  1  lollo
3  4  3  gigio
4  3  1  lollo 

   gigio  lollo
1  False   True
3   True  False


By calling `map.map_col2col(col_1, col_2)` and then `map.map_col2col(col_2, col_1)`, one can determine the relation between the two columns.

`map.foreign_k2k`: are you wondering whether two columns from different tables have the same unique values?

In [9]:

# Check the values of column a against column b; judging by the output below, the result lists
# each value of a together with the number of matches found in b.
map.foreign_k2k(df_maps2.a, df_maps2.b)

Unnamed: 0,table1.a_values,matches_in_table2.b > -1
0,1,2
1,2,0
2,3,1
3,4,0


# 4. Need Trimming?
A long column of strings sometimes starts (or ends) with undesirable characters such as brackets. Let's quickly check which first and last characters occur in our strings:

In [11]:
def initials(s):
    """Count the first characters of the strings in a Series.

    Parameters
    ----------
    s : pandas.Series
        Values to inspect. A string contributes its first character; an empty
        string contributes '' instead of raising IndexError. Non-string
        values (NaN included) are passed through unchanged.

    Returns
    -------
    pandas.Series
        Number of occurrences of each distinct first character (or
        passed-through value). NaN entries are counted as well, because
        value_counts is called with dropna=False.
    """
    # x[:1] equals x[0] for a non-empty string and is '' for an empty one,
    # whereas x[0] would raise IndexError on ''.
    first_chars = s.map(lambda x: x[:1] if isinstance(x, str) else x)
    return first_chars.value_counts(dropna=False)

In [13]:
def finals(s):
    """Count the last characters of the strings in a Series.

    Parameters
    ----------
    s : pandas.Series
        Values to inspect. A string contributes its last character; an empty
        string contributes '' instead of raising IndexError. Non-string
        values (NaN included) are passed through unchanged.

    Returns
    -------
    pandas.Series
        Number of occurrences of each distinct last character (or
        passed-through value). NaN entries are counted as well, because
        value_counts is called with dropna=False.
    """
    # x[-1:] equals x[-1] for a non-empty string and is '' for an empty one,
    # whereas x[-1] would raise IndexError on ''.
    last_chars = s.map(lambda x: x[-1:] if isinstance(x, str) else x)
    return last_chars.value_counts(dropna=False)