# Exploration and Preprocessing Patrick

In [8]:
# Import relevant packages
import pandas as pd
import numpy as np
import os

## Data Loading

In [2]:
# List the folders containing the text files
folders = ['../data/raw/txt_files_0_1750', '../data/raw/txt_files_1751_3500', '../data/raw/txt_files_3501_5000']

# Raw text of every readable file; one list entry per file
data = []

for folder in folders:
    # NOTE: os.listdir returns entries in arbitrary order, so the row index a given
    # file ends up at is only stable for the same file system / machine.
    for file_name in os.listdir(folder):
        file_path = os.path.join(folder, file_name)
        try:
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default locale encoding.
            with open(file_path, 'r', encoding='utf-8') as file:
                data.append(file.read())
        except FileNotFoundError:
            print(f"File {file_name} not found.")
        except UnicodeDecodeError:
            # Not an IOError: without this branch one undecodable file aborts the whole load.
            print(f"File {file_name} is not valid UTF-8 and was skipped.")
        except IOError:
            print(f"Error reading the file {file_name}.")

# Create a DataFrame with one row per successfully read file
df = pd.DataFrame(data, columns=['Content'])
df

Unnamed: 0,Content
0,ARIVE | Your favorite brandsGet the appOur Vis...
1,Rückabwicklung - von Lebensversicherungen\nWir...
2,\nStarke Versicherungen für deinen Lifestyle...
3,\nRESTUBE - the airbag for more freedom and s...
4,CureVac - Wir revolutionieren die mRNA für das...
...,...
3275,\n LEMONSGATE ...
3276,Safily\n \n√úberspringen\nSuchen\nSuche schlie...
3277,Startseite | ICO-LUX\n \nFraud Prevention\nÜbe...
3278,Versus | Compare everything\nCategoriessmartph...


# <span style="color:green">To do: Detect pages with loading errors (example: vamosai.txt)</span>

<blockquote>
<p style="font-size: smaller;">
Seite nicht gefunden – DDG Landingpages
RU
UA
EN
Sign in
Sign up
RU
UA
EN
Sign in
Sign up
 
404 - Not Found
DDG Landingpages > 404
 
Hmm, we could not find what you are looking for.
I
</p>
</blockquote>

<blockquote>
<p style="font-size: smaller;">
Site is undergoing maintenance 
</p>
</blockquote>

## Cleaning the HTML as a preprocessing step for text translation

In [3]:
import re

#functions to clean Html as first step of the preprocessing
def cleanHtml(sentence):
    """Strip HTML tags from ``sentence`` and repair a few known mis-encoded sequences.

    The input is converted with ``str()`` first. Every ``<...>`` tag (on a single line)
    is replaced by a space; afterwards the literal substitutions listed below are
    applied one after another, in exactly this order.

    Returns the cleaned string.
    """
    # (search, replacement) pairs: newline, broken umlauts/eszett, ampersand, stray quote/space residue
    substitutions = (
        ("\n", " "),
        ("√ú", "Ü"),
        ("√ü", "ß"),
        ("√∂", "ö"),
        ("√º", "ü"),
        ("√§", "ä"),
        ("&", "and"),
        ("‚Äú", " "),
        ("¬†", " "),
    )
    text = re.sub('<.*?>', ' ', str(sentence))
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    return text

# Run the HTML clean-up over every document, then display the frame
df['Content'] = df['Content'].apply(func=cleanHtml)
df

Unnamed: 0,Content
0,ARIVE | Your favorite brandsGet the appOur Vis...
1,Rückabwicklung - von Lebensversicherungen Wir ...
2,Starke Versicherungen für deinen Lifestyle ...
3,RESTUBE - the airbag for more freedom and sa...
4,CureVac - Wir revolutionieren die mRNA für das...
...,...
3275,LEMONSGATE ...
3276,Safily Überspringen Suchen Suche schließen Z...
3277,Startseite | ICO-LUX Fraud Prevention Über u...
3278,Versus | Compare everything Categoriessmartpho...


In [4]:
# Snapshot the cleaned frame (independent copy) so later cells can restore it
df_result = df.copy()
df_result


Unnamed: 0,Content
0,ARIVE | Your favorite brandsGet the appOur Vis...
1,Rückabwicklung - von Lebensversicherungen Wir ...
2,Starke Versicherungen für deinen Lifestyle ...
3,RESTUBE - the airbag for more freedom and sa...
4,CureVac - Wir revolutionieren die mRNA für das...
...,...
3275,LEMONSGATE ...
3276,Safily Überspringen Suchen Suche schließen Z...
3277,Startseite | ICO-LUX Fraud Prevention Über u...
3278,Versus | Compare everything Categoriessmartpho...


In [7]:
# Restore the working frame from the snapshot taken after HTML cleaning
df = df_result.copy()
df

Unnamed: 0,Content
0,ARIVE | Your favorite brandsGet the appOur Vis...
1,Rückabwicklung - von Lebensversicherungen Wir ...
2,Starke Versicherungen für deinen Lifestyle ...
3,RESTUBE - the airbag for more freedom and sa...
4,CureVac - Wir revolutionieren die mRNA für das...
...,...
3275,LEMONSGATE ...
3276,Safily Überspringen Suchen Suche schließen Z...
3277,Startseite | ICO-LUX Fraud Prevention Über u...
3278,Versus | Compare everything Categoriessmartpho...


# Text translation

In [8]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; a fixed seed makes the detected
# language of short or ambiguous texts reproducible between runs.
DetectorFactory.seed = 0

# Rows that are always excluded regardless of their content (carried over unchanged).
EXCLUDED_INDICES = {2700}

# Count the number of texts per detected language
lang_counters = {}
empty_fields_counter = 0
rows_to_drop = []
for index, row in df.iterrows():
    text = row['Content']
    language = None
    # Whitespace-only texts count as empty: HTML cleaning turns newlines into spaces,
    # so a pure length check would let them through to detect().
    if len(text.strip()) != 0 and index not in EXCLUDED_INDICES:
        try:
            language = detect(text)
        except LangDetectException:
            # detect() could not determine a language (presumably no usable features,
            # e.g. only digits or symbols); treat the row like an empty text.
            language = None

    if language is not None:
        df.at[index, 'language'] = language
        lang_counters[language] = lang_counters.get(language, 0) + 1
    else:
        print(f"\033[33m Attention: \033[0m At index {index} no text is found and will be deleted.")
        empty_fields_counter += 1
        rows_to_drop.append(index)

# Remove all rows without usable text in one step instead of rebuilding the frame per row
df = df.drop(index=rows_to_drop)

print(f"Numbers of texts in different languages: {lang_counters}")
print(f"Number of deleted rows because no text was found: {empty_fields_counter}")

[33m Attention: [0m At index 77 no text is found and will be deleted.
[33m Attention: [0m At index 134 no text is found and will be deleted.
[33m Attention: [0m At index 279 no text is found and will be deleted.
[33m Attention: [0m At index 323 no text is found and will be deleted.
[33m Attention: [0m At index 397 no text is found and will be deleted.
[33m Attention: [0m At index 423 no text is found and will be deleted.
[33m Attention: [0m At index 625 no text is found and will be deleted.
[33m Attention: [0m At index 851 no text is found and will be deleted.
[33m Attention: [0m At index 952 no text is found and will be deleted.
[33m Attention: [0m At index 1064 no text is found and will be deleted.
[33m Attention: [0m At index 1269 no text is found and will be deleted.
[33m Attention: [0m At index 1634 no text is found and will be deleted.
[33m Attention: [0m At index 1698 no text is found and will be deleted.
[33m Attention: [0m At index 1842 no text is f

In [9]:
# Snapshot the frame (now including the detected language) before translation starts
df_result_before_trans = df.copy()
df_result_before_trans

Unnamed: 0,Content,language
0,ARIVE | Your favorite brandsGet the appOur Vis...,en
1,Rückabwicklung - von Lebensversicherungen Wir ...,de
2,Starke Versicherungen für deinen Lifestyle ...,de
3,RESTUBE - the airbag for more freedom and sa...,en
4,CureVac - Wir revolutionieren die mRNA für das...,de
...,...,...
3275,LEMONSGATE ...,de
3276,Safily Überspringen Suchen Suche schließen Z...,de
3277,Startseite | ICO-LUX Fraud Prevention Über u...,de
3278,Versus | Compare everything Categoriessmartpho...,en


In [6]:
# Restore the working frame from the pre-translation snapshot
df = df_result_before_trans.copy(deep=True)
df

Unnamed: 0,Content,language
0,ARIVE | Your favorite brandsGet the appOur Vis...,en
1,Rückabwicklung - von Lebensversicherungen Wir ...,de
2,Starke Versicherungen für deinen Lifestyle ...,de
3,RESTUBE - the airbag for more freedom and sa...,en
4,CureVac - Wir revolutionieren die mRNA für das...,de
...,...,...
3275,LEMONSGATE ...,de
3276,Safily Überspringen Suchen Suche schließen Z...,de
3277,Startseite | ICO-LUX Fraud Prevention Über u...,de
3278,Versus | Compare everything Categoriessmartpho...,en


In [11]:
# Pick the language with the highest text count (first one wins on a tie)
max_lang_count = max(lang_counters.items(), key=lambda entry: entry[1])[0]
print(f"The most texts are written in '{max_lang_count}'")

# Add an empty column that will receive every text in that language
df[f"Content_in_{max_lang_count}"] = None

The most texts are written in 'de'


## <span style="color:green">To do: Evaluate whether splitting the text into blocks has a negative impact on the translation quality </span>
#### <span style="color:red">To do: Index 2771, 3090 and 3190 could not be translated </span>


In [42]:
from deep_translator import GoogleTranslator

# Maximum number of characters sent per request; presumably chosen to stay below
# the translator's per-request length limit (TODO confirm the exact limit).
MAX_CHUNK_CHARS = 4900
# Rows that could not be translated in the original run; they are skipped outright.
UNTRANSLATABLE_INDICES = {2771, 3090, 3190}


def _split_into_chunks(text, max_len):
    """Split ``text`` into consecutive pieces of at most ``max_len`` characters.

    A piece is cut at the last space inside its window when there is one, so words
    are not torn apart; text without any space falls back to a hard cut.
    """
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_len, len(text))
        if end < len(text):
            # Search after `start` so a chunk never becomes empty.
            last_space = text.rfind(' ', start + 1, end)
            if last_space != -1:
                end = last_space
        chunks.append(text[start:end])
        start = end
    return chunks


target_col = f'Content_in_{max_lang_count}'
# The language column is renamed at the end of this cell; accept both names so the
# cell can be re-run to resume an interrupted translation.
lang_col = 'language' if 'language' in df.columns else 'original_language'
# One translator object is enough for all requests
translator = GoogleTranslator(source='auto', target=max_lang_count)

# Translate all texts which are not written in the chosen language
for index, row in df.iterrows():
    # Skip known-bad rows and rows that already have a result from an earlier run
    if index in UNTRANSLATABLE_INDICES or not pd.isna(row[target_col]):
        continue

    if row[lang_col] == max_lang_count:
        # Already in the target language: take the text over unchanged
        df.at[index, target_col] = row['Content']
        continue

    try:
        translated_chunks = [
            translator.translate(chunk)
            for chunk in _split_into_chunks(row['Content'], MAX_CHUNK_CHARS)
            if chunk.strip()
        ]
    except Exception as error:
        # Leave the cell empty so a later re-run retries this row instead of
        # aborting the whole (long-running) loop.
        print(f"Text at index {index} could not be translated: {error}")
        continue

    df.at[index, target_col] = ' '.join(piece for piece in translated_chunks if piece)
    print(f"Text at index {index} was translated.")

df = df.rename(columns={'language': 'original_language'})
       

3191
3192
Text at index 3192 was translated.
3193
Text at index 3193 was translated.
3194
3195
3196
3197
Text at index 3197 was translated.
3198
3199
3200
3201
3202
3203
Text at index 3203 was translated.
3204
Text at index 3204 was translated.
3205
Text at index 3205 was translated.
3206
Text at index 3206 was translated.
3207
Text at index 3207 was translated.
3208
3209
Text at index 3209 was translated.
3210
3211
3212
Text at index 3212 was translated.
3213
Text at index 3213 was translated.
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
Text at index 3225 was translated.
3226
3227
Text at index 3227 was translated.
3228
3229
3230
Text at index 3230 was translated.
3231
3232
3233
3234
Text at index 3234 was translated.
3235
3236
3237
3238
3239
3240
Text at index 3240 was translated.
3241
3242
3243
3244
Text at index 3244 was translated.
3245
3246
3247
3248
Text at index 3248 was translated.
3249
Text at index 3249 was translated.
3250
3251
3252
Text at index 3252 was tr

In [43]:
# Quick look at the first rows while or after the translation runs
df.head(n=5)

Unnamed: 0,Content,original_language,Content_in_de
0,ARIVE | Your favorite brandsGet the appOur Vis...,en,ANKOMMEN | Ihre LieblingsmarkenHolen Sie sich ...
1,Rückabwicklung - von Lebensversicherungen Wir ...,de,Rückabwicklung - von Lebensversicherungen Wir ...
2,Starke Versicherungen für deinen Lifestyle ...,de,Starke Versicherungen für deinen Lifestyle ...
3,RESTUBE - the airbag for more freedom and sa...,en,RESTUBE - the airbag for more freedom and safe...
4,CureVac - Wir revolutionieren die mRNA für das...,de,CureVac - Wir revolutionieren die mRNA für das...


In [44]:
# Snapshot the translated frame so the translation does not have to be repeated
df_result_after_translation = df.copy(deep=True)
df_result_after_translation

Unnamed: 0,Content,original_language,Content_in_de
0,ARIVE | Your favorite brandsGet the appOur Vis...,en,ANKOMMEN | Ihre LieblingsmarkenHolen Sie sich ...
1,Rückabwicklung - von Lebensversicherungen Wir ...,de,Rückabwicklung - von Lebensversicherungen Wir ...
2,Starke Versicherungen für deinen Lifestyle ...,de,Starke Versicherungen für deinen Lifestyle ...
3,RESTUBE - the airbag for more freedom and sa...,en,RESTUBE - the airbag for more freedom and safe...
4,CureVac - Wir revolutionieren die mRNA für das...,de,CureVac - Wir revolutionieren die mRNA für das...
...,...,...,...
3275,LEMONSGATE ...,de,LEMONSGATE ...
3276,Safily Überspringen Suchen Suche schließen Z...,de,Safily Überspringen Suchen Suche schließen Z...
3277,Startseite | ICO-LUX Fraud Prevention Über u...,de,Startseite | ICO-LUX Fraud Prevention Über u...
3278,Versus | Compare everything Categoriessmartpho...,en,Versus | Alles vergleichen KategorienSmartphon...


In [5]:
# Restore the working frame from the snapshot taken after translation
df = df_result_after_translation.copy(deep=True)

# Persist the translated texts as a JSON file
df.to_json('translated_results.json')

# Data Cleaning

## Preprocessing functions from the exercise

In [81]:
# Reload the translated texts from the JSON file and let the translated text
# take over the name 'Content'
df = (
    pd.read_json('translated_results.json')
    .drop(columns="Content")                       # discard the original, untranslated text
    .rename(columns={"Content_in_de": "Content"})  # translated column becomes 'Content'
)

# Display the modified DataFrame
print(df)

     original_language                                            Content
0                   en  ANKOMMEN | Ihre LieblingsmarkenHolen Sie sich ...
1                   de  Rückabwicklung - von Lebensversicherungen Wir ...
2                   de     Starke Versicherungen für deinen Lifestyle ...
3                   en  RESTUBE - the airbag for more freedom and safe...
4                   de  CureVac - Wir revolutionieren die mRNA für das...
...                ...                                                ...
3275                de                      LEMONSGATE                ...
3276                de  Safily   Überspringen Suchen Suche schließen Z...
3277                de  Startseite | ICO-LUX   Fraud Prevention Über u...
3278                en  Versus | Alles vergleichen KategorienSmartphon...
3279                de  Selfstorage München and deutschlandweit | Stor...

[3258 rows x 2 columns]


In [74]:
# import packages
import re

# import pandas as pd
# import numpy as np

# import matplotlib.pyplot as plt
# import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')


from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

# from skmultilearn.problem_transform import BinaryRelevance

[nltk_data] Downloading package stopwords to /Users/kathi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [84]:
# define functions

def cleanPunc(sentence):
    """Remove or blank out punctuation and lower-case German umlauts.

    * ``? ! ' " #`` are deleted.
    * ``. , ) ( | \\ /`` are replaced by a space, so the words on either side stay separated.
    * ``Ä Ö Ü`` are mapped to ``ä ö ü``.

    Inside a regex character class ``|`` is a literal, not a separator. The classes
    are therefore written without separators; otherwise every pipe would be deleted
    (gluing neighbouring words together) and the backslash would never be matched.

    Returns the result with leading/trailing whitespace stripped.
    """
    cleaned = re.sub(r'[?!\'"#]', r'', sentence)
    cleaned = re.sub(r'[.,)(|\\/]', r' ', cleaned)
    cleaned = re.sub(r'[äÄ]', 'ä', cleaned)
    cleaned = re.sub(r'[öÖ]', 'ö', cleaned)
    cleaned = re.sub(r'[üÜ]', 'ü', cleaned)
    cleaned = cleaned.strip()
    return cleaned

def keepAlpha(sentence):
    """Keep only ASCII letters and German umlauts/eszett in ``sentence``.

    The sentence is split on whitespace; inside each token every run of other
    characters is replaced by a single space. The tokens are then joined with one
    space and the result is stripped.
    """
    non_letters = re.compile('[^a-zA-ZäöüÄÖÜß]+', flags=re.UNICODE)
    pieces = [non_letters.sub(' ', token) for token in sentence.split()]
    return " ".join(pieces).strip()

def removeStopWords(sentence, stopwords):
    """Replace every stop word in ``sentence`` (case-insensitive) with a space.

    Parameters
    ----------
    sentence : str
        Text to filter.
    stopwords : set, frozenset, list or tuple of str
        Words to remove. For backward compatibility, any other object (for example
        the nltk ``stopwords`` corpus module) is ignored and the module-level
        ``stop_words`` collection is used instead.

    Returns
    -------
    str
        The sentence with each stop word, together with the single non-word character
        following it, replaced by one space. A stop word at the very end of the
        sentence is removed as well.
    """
    if isinstance(stopwords, (set, frozenset, list, tuple)):
        words = stopwords
    else:
        words = stop_words
    if not words:
        # An empty alternation would match at every word boundary; nothing to remove.
        return sentence
    # sorted() keeps the pattern text stable between runs; re.escape guards against
    # words that contain regex metacharacters.
    alternatives = "|".join(re.escape(word) for word in sorted(words))
    pattern = r"\b(?:" + alternatives + r")(?:\W|$)"
    return re.sub(pattern, " ", sentence, flags=re.I)

def stemming(sentence):
    """Stem every whitespace-separated token of ``sentence`` with the module-level
    ``stemmer`` and return the stems joined by single spaces."""
    return " ".join(stemmer.stem(token) for token in sentence.split()).strip()

# define stopwords (German and English, since some texts remain untranslated)
stop_words = set(stopwords.words(['german', 'english']))

# define stemmer
stemmer = SnowballStemmer('german')

# apply functions: normalise to lower-case strings, then clean, filter and stem
df['Content'] = (
    df['Content']
    .astype(str)
    .str.lower()
    .apply(cleanPunc)
    .apply(keepAlpha)
    # pass the stop-word set itself, not the nltk `stopwords` corpus module
    .apply(removeStopWords, stopwords=stop_words)
    .apply(stemming)
)

#### <span style="color: orange">Note: </span>Umlautnormalisierung geschieht beim stemmer, obwohl deutsch eingestellt ist

In [85]:
# Inspect the first 50 preprocessed rows
df.head(n=50)

Unnamed: 0,original_language,Content
0,en,ankomm lieblingsmarkenhol appuns visionpartner...
1,de,ruckabwickl lebensversicher hol doppelt leb re...
2,de,stark versicher lifestyl held de hom produkt f...
3,en,restub airbag freedom safety wat restub skip c...
4,de,curevac revolutioni mrna leb mensch fuhrungste...
5,en,hom open bank project inhalt spring produkt op...
6,en,viamon gmbh hom de hom technologi servic konta...
7,en,hivebuy unternehmensweit ord rechnung losung h...
8,de,startseit greenflash start mission referenz pr...
9,de,sit undergoing maintenanc doctorsgat wartungsm...


In [None]:
# function for tokenization -> only use when vectorizer doesn't include tokenization (probably included)
#def tokenize(text):
#    tokens = re.split(r'\W+', text)

#    return tokens

# applying function to the column
#df['Content'] = df['Content'].apply(lambda x: tokenize(x))
#df

In [43]:
# Specify the path to the folder where the CSV file is saved
folder_path = '../data/preprocessed'

# Create the folder if it is missing; to_csv does not create parent directories
os.makedirs(folder_path, exist_ok=True)

# Specify the file name and full path for the CSV file
file = 'preprocessed_data.csv'
path = os.path.join(folder_path, file)

# Save the preprocessed data to a CSV file (without the index column)
df.to_csv(path, index=False)

In [47]:
# Export the installed packages to a requirements file; `pip list --format=freeze` is used
# so that each entry carries the package version instead of a local path.
!pip list --format=freeze > requirements_preprocessing.txt