In [1]:
import pandas as pd
import re
import collections
import numpy as np

In [2]:
@pd.api.extensions.register_series_accessor("dvp")
class DVP:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj
    
    @staticmethod
    def get_pattern(string, regex=False):
        """
        Using regex the pattern for each element of a string is derived, using the following convention
        digits : d
        letter (lower case) : a
        letter (upper case) : A
        space : s
        special character : *

        For example, the string 'python101' is represented by the pattern 'aaaaaaddd'. 
        """
        
        string = str(string)
        pattern = {j : 'd' for i in re.finditer(r'\d+', string) for j in range(i.start(),i.end())}
        pattern = {**pattern, **{j : 'a' for i in re.finditer(r'[a-z]+', string) for j in range(i.start(),i.end())}}
        pattern = {**pattern, **{j : 'A' for i in re.finditer(r'[A-Z]+', string) for j in range(i.start(),i.end())}}
        pattern = {**pattern, **{j : 's' for i in re.finditer(r'\s+', string) for j in range(i.start(),i.end())}}
        pattern = {**pattern, **{j : '*' for i in re.finditer(r'[^\w\s]', string) for j in range(i.start(),i.end())}}
        if regex:
            regex_dict = {'d':'\d', 'a':'[a-z]', 'A':'[A-Z]', 's':'\s', '*':'[^\w\s]'}
            pattern = {i : regex_dict[pattern[i]] for i in pattern.keys()}

        return(''.join(collections.OrderedDict(sorted(pattern.items())).values()))
    
    def pattern_counts(self, regex=False, normalize=False, sort=True, ascending=False, bins=None):
        """
        Returns object containing counts of unique patterns.
        
        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values.
        
        A pattern is a meta description of values in the series. Each element in the series is transforemd to a string. 
        From there a pattern is derived atomically applying the following rule,
        digits : d
        letter (lower case) : a
        letter (upper case) : A
        space : s
        special character : *

        Parameters
        ----------
        normalize : boolean, default False
            If True then the object returned will contain the relative
            frequencies of the unique patterns.
        sort : boolean, default True
            Sort by values
        ascending : boolean, default False
            Sort in ascending order
        bins : integer, optional
            Rather than count values, group them into half-open bins,
            a convenience for pd.cut, only works with numeric data

        Returns
        -------
        counts : Series
        """
        
        series =  self._obj.dropna().apply(lambda x: self.get_pattern(x, regex=regex))
        return series.value_counts(sort=sort, ascending=ascending, normalize=normalize, bins=bins)

In [3]:
sample = pd.DataFrame({'string':["c't","Sprach . fahrplan","c#q",12, '', 'äa','   ', np.nan]})

In [4]:
sample.string.dvp.pattern_counts()

a*a                  2
Aaaaaas*saaaaaaaa    1
dd                   1
a                    1
sss                  1
                     1
Name: string, dtype: int64