# Exploration and Preprocessing Patrick

In [304]:
# Import relevant packages
import pandas as pd
import numpy as np
import os

## Data Loading

In [320]:
# List the folders containing the text files
folders = ['../data/raw/txt_files_0_1750', '../data/raw/txt_files_1751_3500', '../data/raw/txt_files_3501_5000']

# Initialize an empty list to store the data
data = []

# Iterate through each folder
for folder in folders:
    # Get the list of text files in the folder
    file_list = os.listdir(folder)
    
    # Iterate through each text file
    for file_name in file_list:
        # Read the contents of the file
        file_path = os.path.join(folder, file_name)
        try:
            with open(file_path, 'r') as file:
                content = file.read()
                # Append content to the data list
                data.append(content)
                
        except FileNotFoundError:
            print(f"File {file_name} not found.")
        except IOError:
            print(f"Error reading the file {file_name}.")
        
# Create a DataFrame from the data list
df = pd.DataFrame(data, columns=['Content'])
df

Unnamed: 0,Content
0,ARIVE | Your favorite brandsGet the appOur Vis...
1,Rückabwicklung - von Lebensversicherungen\nWir...
2,\nStarke Versicherungen für deinen Lifestyle...
3,\nRESTUBE - the airbag for more freedom and s...
4,CureVac - Wir revolutionieren die mRNA für das...
...,...
3275,\n LEMONSGATE ...
3276,Safily\n \n√úberspringen\nSuchen\nSuche schlie...
3277,Startseite | ICO-LUX\n \nFraud Prevention\nÜbe...
3278,Versus | Compare everything\nCategoriessmartph...


# <span style="color:green">To do: Detect pages with loading errors: example: vamosai.txt </span>:

<blockquote>
<p style="font-size: smaller;">
Seite nicht gefunden – DDG Landingpages
RU
UA
EN
Sign in
Sign up
RU
UA
EN
Sign in
Sign up
 
404 - Not Found
DDG Landingpages > 404
 
Hmm, we could not find what you are looking for.
I
</p>
</blockquote>

<blockquote>
<p style="font-size: smaller;">
Site is undergoing maintenance 
</p>
</blockquote>

## Cleaning the Html as preprocessing step for text translation

In [321]:
import re

#functions to clean Html as first step of the preprocessing
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    cleantext = cleantext.replace("\n"," ")
    cleantext = cleantext.replace("√ú","Ü")
    cleantext = cleantext.replace("√ü","ß")
    cleantext = cleantext.replace("√∂", "ö")
    cleantext = cleantext.replace("√º", "ü")
    cleantext = cleantext.replace("√§", "ä")
    cleantext = cleantext.replace("&", "and")
    cleantext = cleantext.replace("‚Äú", " ")
    cleantext = cleantext.replace("¬†", " ")
    return cleantext

df['Content'] = df['Content'].apply(cleanHtml)
df

Unnamed: 0,Content
0,ARIVE | Your favorite brandsGet the appOur Vis...
1,Rückabwicklung - von Lebensversicherungen Wir ...
2,Starke Versicherungen für deinen Lifestyle ...
3,RESTUBE - the airbag for more freedom and sa...
4,CureVac - Wir revolutionieren die mRNA für das...
...,...
3275,LEMONSGATE ...
3276,Safily Überspringen Suchen Suche schließen Z...
3277,Startseite | ICO-LUX Fraud Prevention Über u...
3278,Versus | Compare everything Categoriessmartpho...


In [322]:
#test result and store the result in another dataframe 
df_result = df.copy(deep=True)
df_result


Unnamed: 0,Content
0,ARIVE | Your favorite brandsGet the appOur Vis...
1,Rückabwicklung - von Lebensversicherungen Wir ...
2,Starke Versicherungen für deinen Lifestyle ...
3,RESTUBE - the airbag for more freedom and sa...
4,CureVac - Wir revolutionieren die mRNA für das...
...,...
3275,LEMONSGATE ...
3276,Safily Überspringen Suchen Suche schließen Z...
3277,Startseite | ICO-LUX Fraud Prevention Über u...
3278,Versus | Compare everything Categoriessmartpho...


In [323]:
#load the result from steps before
df = df_result.copy(deep=True)
df

Unnamed: 0,Content
0,ARIVE | Your favorite brandsGet the appOur Vis...
1,Rückabwicklung - von Lebensversicherungen Wir ...
2,Starke Versicherungen für deinen Lifestyle ...
3,RESTUBE - the airbag for more freedom and sa...
4,CureVac - Wir revolutionieren die mRNA für das...
...,...
3275,LEMONSGATE ...
3276,Safily Überspringen Suchen Suche schließen Z...
3277,Startseite | ICO-LUX Fraud Prevention Über u...
3278,Versus | Compare everything Categoriessmartpho...


# Text translation

In [324]:
from langdetect import detect

#count number of texts in different langauges
lang_counters = {}
empty_fields_counter = 0
for index, row in df.iterrows():
    if len(row['Content']) != 0 and index != 2700:
        language = detect(row['Content'])
        df.at[index, 'language'] = language
        if language in lang_counters:
            lang_counters[language] += 1
        else:
            lang_counters[language] = 1
    else: 
        print(f"\033[33m Attention: \033[0m At index {index} no text is found and will be deleted.")
        empty_fields_counter += 1
        
        #delete row from Dataframe with empty text
        df = df.drop(index)
               
print(f"Numbers of texts in different languages: {lang_counters}")
print(f"Number of deleted rows because no text was found: {empty_fields_counter}")

[33m Attention: [0m At index 77 no text is found and will be deleted.
[33m Attention: [0m At index 134 no text is found and will be deleted.
[33m Attention: [0m At index 279 no text is found and will be deleted.
[33m Attention: [0m At index 323 no text is found and will be deleted.
[33m Attention: [0m At index 397 no text is found and will be deleted.
[33m Attention: [0m At index 423 no text is found and will be deleted.
[33m Attention: [0m At index 625 no text is found and will be deleted.
[33m Attention: [0m At index 851 no text is found and will be deleted.
[33m Attention: [0m At index 952 no text is found and will be deleted.
[33m Attention: [0m At index 1064 no text is found and will be deleted.
[33m Attention: [0m At index 1269 no text is found and will be deleted.
[33m Attention: [0m At index 1634 no text is found and will be deleted.
[33m Attention: [0m At index 1698 no text is found and will be deleted.
[33m Attention: [0m At index 1842 no text is f

In [327]:
#test result and store the result in another dataframe 
df_result_before_trans = df.copy(deep=True)
df_result_before_trans

Unnamed: 0,Content,language
0,ARIVE | Your favorite brandsGet the appOur Vis...,en
1,Rückabwicklung - von Lebensversicherungen Wir ...,de
2,Starke Versicherungen für deinen Lifestyle ...,de
3,RESTUBE - the airbag for more freedom and sa...,en
4,CureVac - Wir revolutionieren die mRNA für das...,de
...,...,...
3275,LEMONSGATE ...,de
3276,Safily Überspringen Suchen Suche schließen Z...,de
3277,Startseite | ICO-LUX Fraud Prevention Über u...,de
3278,Versus | Compare everything Categoriessmartpho...,en


In [333]:
#load the result from steps before
df = df_result_before_trans.copy()
df

Unnamed: 0,Content,language
0,ARIVE | Your favorite brandsGet the appOur Vis...,en
1,Rückabwicklung - von Lebensversicherungen Wir ...,de
2,Starke Versicherungen für deinen Lifestyle ...,de
3,RESTUBE - the airbag for more freedom and sa...,en
4,CureVac - Wir revolutionieren die mRNA für das...,de
...,...,...
3275,LEMONSGATE ...,de
3276,Safily Überspringen Suchen Suche schließen Z...,de
3277,Startseite | ICO-LUX Fraud Prevention Über u...,de
3278,Versus | Compare everything Categoriessmartpho...,en


In [347]:
#take the most used language
max_lang_count = max(lang_counters, key=lang_counters.get)
print(f"The most texts are written in '{max_lang_count}'")

#Generate column for translated content in most used language
df[f"Content_in_{max_lang_count}"] = None

The most texts are written in 'de


## <span style="color:green">To do: Evaluate if splitting the text into blocks has a negativ impact on the performance of translation </span>


In [362]:
from deep_translator import GoogleTranslator

#translate all texts which are not written in the choosen language 
for index, row in df.iterrows():
    if row[f'Content_in_{max_lang_count}'] == None:
        translated_chunks=[]
        if row['language'] != max_lang_count:
            if len(row['Content']) > 4900:
                # Split the text into smaller chunks for translation
                chunk_size = 4900  # Choose chank size
                chunks = [row['Content'][i:i+chunk_size] for i in range(0, len(row['Content']), chunk_size)]

                # Translate each chunk and join them back
                translated_chunks = []
                for chunk in chunks:
                    translation = GoogleTranslator(source='auto', target=max_lang_count).translate(chunk)
                    translated_chunks.append(translation)
                translation = ' '.join(translated_chunks)
            else:
                translation = GoogleTranslator(source='auto', target=max_lang_count).translate(row['Content'])
            df.at[index, f'Content_in_{max_lang_count}'] = translation
            print(f"Text at index {index} was translated.")
        else:
            df.at[index, f'Content_in_{max_lang_count}'] = row['Content']

df.rename(columns={'language': 'original_language'}, inplace=True)    
       

Text at index 333 was translated.
Text at index 334 was translated.
Text at index 335 was translated.
Text at index 337 was translated.
Text at index 338 was translated.
Text at index 339 was translated.
Text at index 342 was translated.
Text at index 344 was translated.
Text at index 345 was translated.
Text at index 347 was translated.
Text at index 348 was translated.
Text at index 350 was translated.
Text at index 351 was translated.
Text at index 356 was translated.
Text at index 358 was translated.
Text at index 361 was translated.
Text at index 362 was translated.
Text at index 363 was translated.
Text at index 371 was translated.
Text at index 373 was translated.
Text at index 376 was translated.
Text at index 380 was translated.
Text at index 382 was translated.
Text at index 394 was translated.
Text at index 395 was translated.
Text at index 396 was translated.
Text at index 399 was translated.
Text at index 400 was translated.
Text at index 401 was translated.
Text at index 

In [361]:
#for testing result while or after translation
df.head()

Unnamed: 0,Content,language,Content_in_de
0,ARIVE | Your favorite brandsGet the appOur Vis...,en,ANKOMMEN | Ihre LieblingsmarkenHolen Sie sich ...
1,Rückabwicklung - von Lebensversicherungen Wir ...,de,Rückabwicklung - von Lebensversicherungen Wir ...
2,Starke Versicherungen für deinen Lifestyle ...,de,Starke Versicherungen für deinen Lifestyle ...
3,RESTUBE - the airbag for more freedom and sa...,en,RESTUBE - the airbag for more freedom and safe...
4,CureVac - Wir revolutionieren die mRNA für das...,de,CureVac - Wir revolutionieren die mRNA für das...
5,Home - Open Bank Project Skip to content ...,en,Home - Open Bank Project Zum Inhalt springen P...
6,viamon GmbH :: Home DE Home Technology and Se...,en,viamon GmbH :: Home DE Home Technologie und Se...
7,Hivebuy: Your company-wide ordering to invoice...,en,Hivebuy: Ihre unternehmensweite Order-to-Rechn...
8,Startseite – Greenflash Start Über uns Unsere ...,de,Startseite – Greenflash Start Über uns Unsere ...
9,Site is undergoing maintenance Doctorsgat...,de,Site is undergoing maintenance Doctorsgat...


In [360]:
#test result and store the result in another dataframe 
df_result_after_translation = df.copy()
df_result_after_translation

Unnamed: 0,Content,language,Content_in_de
0,ARIVE | Your favorite brandsGet the appOur Vis...,en,ANKOMMEN | Ihre LieblingsmarkenHolen Sie sich ...
1,Rückabwicklung - von Lebensversicherungen Wir ...,de,Rückabwicklung - von Lebensversicherungen Wir ...
2,Starke Versicherungen für deinen Lifestyle ...,de,Starke Versicherungen für deinen Lifestyle ...
3,RESTUBE - the airbag for more freedom and sa...,en,RESTUBE - the airbag for more freedom and safe...
4,CureVac - Wir revolutionieren die mRNA für das...,de,CureVac - Wir revolutionieren die mRNA für das...
...,...,...,...
3275,LEMONSGATE ...,de,
3276,Safily Überspringen Suchen Suche schließen Z...,de,
3277,Startseite | ICO-LUX Fraud Prevention Über u...,de,
3278,Versus | Compare everything Categoriessmartpho...,en,


In [None]:
#load the result from steps before
df = df_result_after_translation.copy()
df

# Data Cleaning

## Preprocesssing Functions from the exercise

In [211]:
# import packages
import re

# import pandas as pd
# import numpy as np

# import matplotlib.pyplot as plt
# import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

# from skmultilearn.problem_transform import BinaryRelevance

In [359]:
# define functions

def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'', sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ', cleaned)
    cleaned = cleaned.strip()
    return cleaned

def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

def removeStopWords(sentence, stopwords):
    re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)   
    return re_stop_words.sub(" ", sentence)

def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

# define stopwords
stop_words = set(stopwords.words(['german', 'english']))

# define stemmer
stemmer = SnowballStemmer('german', 'english')

# apply functions
df['Content'] = df['Content'].str.lower()
df['Content'] = df['Content'].apply(cleanPunc)
df['Content'] = df['Content'].apply(keepAlpha)
df['Content'] = df['Content'].apply(removeStopWords, stopwords=stopwords)
df['Content'] = df['Content'].apply(stemming)

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/kathi/nltk_data'
    - '/Users/kathi/VSC-projects/bda-startup-clustering/bda_cluster_venv/nltk_data'
    - '/Users/kathi/VSC-projects/bda-startup-clustering/bda_cluster_venv/share/nltk_data'
    - '/Users/kathi/VSC-projects/bda-startup-clustering/bda_cluster_venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [41]:
# check result
df.head()

Unnamed: 0,Content
0,ariv favorit brandsget appour visionpartnershi...
1,r ckabwickl lebensversicher hol doppelt leb re...
2,stark versicher f r lifestyl held de hom produ...
3,restub airbag freedom safety wat restub skip c...
4,curevac revolutioni mrna f r leb mensch ber be...


In [None]:
# function for tokenization -> only use when vectorizer doesn't include tokenization (probably included)
#def tokenize(text):
#    tokens = re.split(r'\W+', text)

#    return tokens

# applying function to the column
#df['Content'] = df['Content'].apply(lambda x: tokenize(x))
#df

In [43]:
# Specifiy the path to the folder where you want to save the CSV file
folder_path = '../data/preprocessed'

# Specify the file name and full path for the CSV file
file = 'preprocessed_data.csv'
path = os.path.join(folder_path, file)

# Save the preprocessed data to a CSV file
df.to_csv(path, index=False)

In [47]:
# generate requirements.txt without some weird paths instead of the package version
!pip list --format=freeze > requirements_preprocessing.txt