# cTF-IDF

Creates the top 10 c-TF-IDF keywords for different timeframes (year, quarter, season, month) and different groups (age, gender, Fahrzweck, Klasse, ...).

In [1]:
import sys
import os
# Put the parent directory on the import path; this must run before the
# wildcard import below so that the local `functions` module (functions.py) resolves.
sys.path.append(os.path.abspath('../'))

from functions import *
import pandas as pd
import plotly.express as px
import plotly.io as pio
import spacy
import xlsxwriter

# One-time resource downloads, kept commented out for reference.
# Uncomment on a fresh environment:
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')
# nltk.download('stopwords')
#!python -m spacy download de_core_news_lg
# (the line above downloads the spaCy language package loaded below)

# Load the German spaCy model (requires de_core_news_lg to be installed)
nlp = spacy.load("de_core_news_lg")

2023-05-24 07:30:13.655128: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer


class CTFIDFVectorizer(TfidfTransformer):
    """Class-based TF-IDF (c-TF-IDF) transformer.

    Expects a count matrix with one row per class/group (all documents of a
    group joined into one) and one column per term. ``fit`` learns a per-term
    weight ``log(n_samples / term_count)``; ``transform`` applies that weight
    and L1-normalises every row.

    The constructor is inherited unchanged from ``TfidfTransformer``. A
    pass-through ``__init__(*args, **kwargs)`` is deliberately not defined:
    scikit-learn's parameter introspection (``get_params``, ``repr``,
    ``clone``) rejects estimators whose ``__init__`` takes varargs, and the
    inherited constructor accepts exactly the same arguments.
    """

    def fit(self, X: sp.csr_matrix, n_samples: int):
        """Learn the idf vector (global term weights).

        Args:
            X: Count matrix of shape (n_groups, n_terms).
            n_samples: Numerator of the idf ratio. The callers in this file
                pass the number of original (un-joined) documents.

        Returns:
            The fitted transformer (``self``).
        """
        _, n_features = X.shape
        # Total occurrences of each term over all groups. ravel() always
        # yields a 1-D vector; squeeze() would collapse a single-term
        # vocabulary to a 0-d array, which sp.diags cannot handle.
        df = np.asarray(X.sum(axis=0)).ravel()
        idf = np.log(n_samples / df)
        # Stored as a diagonal matrix so that transform() is one sparse product.
        self._idf_diag = sp.diags(idf, offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=np.float64)
        return self

    def transform(self, X: sp.csr_matrix) -> sp.csr_matrix:
        """Transform a count-based matrix to c-TF-IDF.

        Args:
            X: Count matrix with the same columns as the matrix given to ``fit``.

        Returns:
            Matrix of the same shape with idf-weighted, L1-normalised rows.
        """
        # `@` is a matrix product for both sparse matrices and sparse arrays;
        # `*` is only a matrix product for the legacy spmatrix classes.
        X = X @ self._idf_diag
        # The product above is a fresh matrix, so normalising in place is safe.
        X = normalize(X, axis=1, norm='l1', copy=False)
        return X
    
# Credits: https://www.maartengrootendorst.com/blog/ctfidf/


# Get data
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# Create documents per label: all posts of one class are joined into a single document
docs = pd.DataFrame({'Document': newsgroups.data, 'Class': newsgroups.target})
docs_per_class = docs.groupby(['Class'], as_index=False).agg({'Document': ' '.join})

# Create bag of words. The fitted vectorizer is kept so that column indices
# can be mapped back to words; the count matrix is therefore built only once.
count_vectorizer = CountVectorizer().fit(docs_per_class.Document)
count = count_vectorizer.transform(docs_per_class.Document)
words = count_vectorizer.get_feature_names_out()

# Create c-TF-IDF (dense array so each class row can be arg-sorted below)
ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=len(docs)).toarray()

# Extract top 10 words per class.
# argsort() is ascending, so each list ends with the highest-scoring word.
# NOTE(review): `label` is also used as the row index into `ctfidf`; this
# assumes the class labels are 0..k-1 and match the groupby row order - confirm.
words_per_class = {newsgroups.target_names[label]: [words[index] for index in ctfidf[label].argsort()[-10:]]
                   for label in docs_per_class.Class}




### Load Data

In [2]:
## Load the overview dataframe with all surveys from 2019-2022 (Feather file)
filelocation = '../../data/DataClean'
df = pd.read_feather(filelocation)

## Load the text-based dataframe with all surveys from 2019-2022,
## including the text comments and the preprocessed text columns (Feather file)
filelocation = '../../data/DataText'
df_text = pd.read_feather(filelocation)

## Load the config workbook: one sheet with the question codes ('fragecodes')
## and one with the invites ('invites')
config = pd.read_excel('../../config/config.xlsx',sheet_name='fragecodes')
invites_month = pd.read_excel('../../config/config.xlsx',sheet_name='invites')

In [48]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer


class CTFIDFVectorizer(TfidfTransformer):
    """Class-based TF-IDF (c-TF-IDF) transformer.

    Expects a count matrix with one row per class/group (all documents of a
    group joined into one) and one column per term. ``fit`` learns a per-term
    weight ``log(n_samples / term_count)``; ``transform`` applies that weight
    and L1-normalises every row.

    The constructor is inherited unchanged from ``TfidfTransformer``. A
    pass-through ``__init__(*args, **kwargs)`` is deliberately not defined:
    scikit-learn's parameter introspection (``get_params``, ``repr``,
    ``clone``) rejects estimators whose ``__init__`` takes varargs, and the
    inherited constructor accepts exactly the same arguments.
    """

    def fit(self, X: sp.csr_matrix, n_samples: int):
        """Learn the idf vector (global term weights).

        Args:
            X: Count matrix of shape (n_groups, n_terms).
            n_samples: Numerator of the idf ratio. The callers in this file
                pass the number of original (un-joined) documents.

        Returns:
            The fitted transformer (``self``).
        """
        _, n_features = X.shape
        # Total occurrences of each term over all groups. ravel() always
        # yields a 1-D vector; squeeze() would collapse a single-term
        # vocabulary to a 0-d array, which sp.diags cannot handle.
        df = np.asarray(X.sum(axis=0)).ravel()
        idf = np.log(n_samples / df)
        # Stored as a diagonal matrix so that transform() is one sparse product.
        self._idf_diag = sp.diags(idf, offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=np.float64)
        return self

    def transform(self, X: sp.csr_matrix) -> sp.csr_matrix:
        """Transform a count-based matrix to c-TF-IDF.

        Args:
            X: Count matrix with the same columns as the matrix given to ``fit``.

        Returns:
            Matrix of the same shape with idf-weighted, L1-normalised rows.
        """
        # `@` is a matrix product for both sparse matrices and sparse arrays;
        # `*` is only a matrix product for the legacy spmatrix classes.
        X = X @ self._idf_diag
        # The product above is a fresh matrix, so normalising in place is safe.
        X = normalize(X, axis=1, norm='l1', copy=False)
        return X


def extract_top_words_per_group(df, text_column, group_column, top_n=10):
    """Extract the top c-TF-IDF words per group from a DataFrame of comments.

    All texts of a group are joined into one document, a bag-of-words count
    matrix is built over these group documents, and the counts are re-weighted
    with ``CTFIDFVectorizer``.

    Args:
        df: DataFrame holding one text per row plus the grouping column.
        text_column: Name of the column with the (string) texts.
        group_column: Name of the column to group by.
        top_n: Number of words to return per group (default 10).

    Returns:
        Dict mapping each group value to a list of up to ``top_n`` words,
        ordered by ascending score (the highest-scoring word is last).

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    # A slice of [-0:] would silently return the whole vocabulary.
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    # Create documents per group
    docs_per_group = df.groupby([group_column], as_index=False).agg({text_column: ' '.join})

    # Create bag of words; the fitted vectorizer maps column indices back to words
    count_vectorizer = CountVectorizer().fit(docs_per_group[text_column])
    count = count_vectorizer.transform(docs_per_group[text_column])
    words = count_vectorizer.get_feature_names_out()

    # Create c-TF-IDF once, as a dense array so each group row can be arg-sorted.
    # n_samples is the number of original rows, not the number of groups.
    ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=len(df)).toarray()

    # Row i of ctfidf belongs to the i-th row of docs_per_group.
    # argsort() is ascending, so the last top_n indices are the highest scores.
    words_per_group = {
        group: [words[index] for index in ctfidf[row].argsort()[-top_n:]]
        for row, group in enumerate(docs_per_group[group_column])
    }

    return words_per_group

In [51]:
df_text.columns

Index(['participant_id', 'u_date', 'year', 'month', 'quarter', 'yearmonth',
       'yearquarter', 'season', 'yearseason', 'Kommentar', 'wime_personal',
       'wime_komfort', 'wime_sauberkeit', 'wime_puenktlich',
       'wime_platzangebot', 'wime_gesamtzuf', 'wime_preis_leistung',
       'wime_fahrplan', 'wime_oes_fahrt', 'S_sprache', 'S_alter', 'S_sex',
       'S_wohnsitz', 'u_klassencode', 'S_AB3_HTA', 'R_anschluss', 'R_stoerung',
       'device_type', 'dispcode', 'u_ticket', 'u_fahrausweis', 'u_preis',
       'R_zweck', 'ft_abfahrt', 'ft_ankunft', 'ft_startort_uic', 'ft_tu',
       'ft_vm', 'ft_vm_kurz', 'ft_zielort_uic', 'fg_abfahrt', 'fg_ankunft',
       'fg_startort_uic', 'fg_zielort_uic', 'fg_startort', 'fg_zielort',
       'ft_startort', 'ft_zielort', 'Kommentar_Character', 'Kommentar_Tokens',
       'Kommentar_Types', 'Kommentar_TTR', 'text_preprocessed',
       'text_preprocessed_tokenized', 'lemmatized', 'nouns', 'adjectives',
       'verbs', 'nouns_adjectives_and_verbs'],
 

In [52]:
extract_top_words_per_group(df=df_text, text_column='text_preprocessed', group_column="yearquarter")

{'2019Q1': ['ticket',
  'werden',
  'strecke',
  'ist',
  'verbindung',
  'fahrt',
  'züge',
  'klasse',
  'app',
  'zug'],
 '2019Q2': ['ticket',
  'wagen',
  'ist',
  'strecke',
  'werden',
  'züge',
  'fahrt',
  'klasse',
  'app',
  'zug'],
 '2019Q3': ['werden',
  'ticket',
  'fahrt',
  'ist',
  'wagen',
  'strecke',
  'klasse',
  'züge',
  'app',
  'zug'],
 '2019Q4': ['verbindung',
  'wagen',
  'strecke',
  'minuten',
  'fahrt',
  'verspätung',
  'klasse',
  'züge',
  'app',
  'zug'],
 '2020Q1': ['ist',
  'verbindung',
  'klasse',
  'ticket',
  'strecke',
  'fahrt',
  'züge',
  'verspätung',
  'app',
  'zug'],
 '2020Q2': ['verbindung',
  'fahrt',
  'züge',
  'tragen',
  'öv',
  'maske',
  'app',
  'corona',
  'zug',
  'maskenpflicht'],
 '2020Q3': ['velo',
  'werden',
  'ticket',
  'verbindung',
  'fahrt',
  'ist',
  'maske',
  'app',
  'maskenpflicht',
  'zug'],
 '2020Q4': ['maskenpflicht',
  'werden',
  'züge',
  'klasse',
  'ist',
  'fahrt',
  'app',
  'maske',
  'corona',
  'zug'