# ToDo:
- derive an epoch label for each text from the author's year of birth (criterion: born 20 years before the period starts) so we can color our data by epoch
- rearrange the code to work in a single function, so we can do fast experiments!
- do some experiments with perplexity

# t-SNE

In [1]:
import pandas as pd

In [2]:
# Per-text metadata table (semicolon-separated, UTF-8).
# NOTE(review): the recorded run of this cell failed with FileNotFoundError,
# and the other inputs are read from "src/" - confirm the intended path.
meta = pd.read_csv("metadata.csv", sep=";", encoding="utf8")
meta.head()

FileNotFoundError: [Errno 2] No such file or directory: 'metadata.csv'

In [None]:
# Author table (tab-separated, UTF-8); later cells read its "Voller_Name",
# "Geburtsjahr" and "Geschlecht" columns.
author_table = pd.read_csv("src/author_table.tsv", encoding="utf8", sep="\t")
author_table.head()

In [None]:
# True if any author row lacks a birth year; the period assignment below
# calls int() on these values, so a missing year would make it fail.
author_table["Geburtsjahr"].isnull().values.any()

In [None]:
# Summary statistics of the birth years, to compare their range with the
# period boundaries used for the epoch labels.
author_table["Geburtsjahr"].describe()

In [None]:
# Inclusive upper birth-year bound of each period, in chronological order.
# Authors born after the last bound are labelled "Moderne".
period_upper_bounds = [
    (1810, "Kunstepoche"),
    (1830, "Vormärz"),
    (1870, "Realismus"),
]


def period_for_birthyear(birthyear):
    """Return the period label for an author's year of birth.

    The bounds are contiguous, so every year maps to exactly one period
    (1831 belongs to "Realismus" instead of falling through to "Moderne").
    """
    year = int(birthyear)
    for upper_bound, label in period_upper_bounds:
        if year <= upper_bound:
            return label
    return "Moderne"


periods = []

for author in meta["author"]:
    # The first matching row is used if a name occurs more than once;
    # .iloc[0] raises IndexError for an author missing from author_table.
    birthyear = author_table["Geburtsjahr"].loc[author_table["Voller_Name"] == author]
    periods.append(period_for_birthyear(birthyear.iloc[0]))

len(periods)

In [None]:
# One "Geschlecht" value per metadata row, taken from the first author_table
# row whose "Voller_Name" equals the text's author.
genders = [
    author_table["Geschlecht"].loc[author_table["Voller_Name"] == name].iloc[0]
    for name in meta["author"]
]

len(genders)

In [None]:
# Number of metadata rows; periods and genders are built with one entry per
# row, so their lengths should match this value.
len(meta)

In [None]:
# Read every text listed in the metadata, in metadata order, so that row i of
# the document-term matrix corresponds to row i of meta.
corpus = []

for filename in meta["filename"]:
    with open("src/corpus_nostopwords/" + filename, encoding="utf8") as handle:
        text = handle.read()
    corpus.append(text)

In [None]:
# Number of loaded texts; one file is read per entry in meta["filename"].
len(corpus)

## tf-idf (currently plain term counts)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words term counts over the whole corpus.
vectorizer = CountVectorizer(
    lowercase=True,
    min_df=20,  # ignore terms that occur in fewer than 20 documents
    # max_df=100,  # (disabled) upper document-frequency cut-off
    max_features=20000,  # keep at most the 20000 most frequent terms
)

X = vectorizer.fit_transform(corpus)

## t-SNE

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
import plotly.express as px

In [None]:
def make_tsne(input_matrix, dim_reduction, n_perplexity, n_iterations, *,
              random_state=42, title=None):
    """Embed the documents in 2-D with t-SNE and show a scatter plot.

    The matrix is first reduced with truncated SVD and then embedded with
    t-SNE. Point metadata comes from the notebook-level ``meta``, ``periods``
    and ``genders`` objects, so ``input_matrix`` must have one row per row of
    ``meta``, in the same order; a length mismatch raises ``ValueError`` when
    the plotting frame is built.

    Parameters
    ----------
    input_matrix
        Document-term matrix passed to ``TruncatedSVD.fit_transform``.
    dim_reduction
        Number of SVD components kept before running t-SNE.
    n_perplexity
        Value passed to t-SNE as ``perplexity``.
    n_iterations
        Maximum number of t-SNE optimisation iterations.
    random_state
        Seed used for both the SVD and the t-SNE step, so that runs with
        different settings can be compared. Pass ``None`` for an unseeded run.
    title
        Plot title. Defaults to the "Fig 4" title used in the write-up; pass
        a string naming the parameters when running several experiments.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``. It is not returned, so a
        notebook cell ending in this call does not render the plot twice.
    """
    import inspect

    # dimension reduction
    svd = TruncatedSVD(n_components=dim_reduction, random_state=random_state)
    reduced = svd.fit_transform(input_matrix)

    # Newer scikit-learn releases renamed TSNE's ``n_iter`` to ``max_iter``;
    # pass the iteration budget under whichever name this installation accepts.
    tsne_parameters = inspect.signature(TSNE).parameters
    iteration_kwarg = "max_iter" if "max_iter" in tsne_parameters else "n_iter"
    tsne = TSNE(n_components=2,
                learning_rate='auto',
                init='random',
                perplexity=n_perplexity,
                random_state=random_state,
                **{iteration_kwarg: n_iterations})
    embedding = tsne.fit_transform(reduced)

    # building the dataset
    df = pd.DataFrame({
        "title": list(meta["title"]),
        "author": list(meta["author"]),
        "period": periods,
        "gender": genders,
        "Dim 1": embedding[:, 0],
        "Dim 2": embedding[:, 1],
    })

    # plotting
    if title is None:
        title = "Fig 4: A first exploration of the corpus based on MFW and gender"

    fig = px.scatter(df,
                     x="Dim 1",
                     y="Dim 2",
                     hover_data=["title", "author", "period"],
                     color="gender",
                     color_discrete_sequence=['#053c6c', '#9d95b0'],
                     title=title)

    fig.update_layout({"plot_bgcolor": "#ffbb91", "paper_bgcolor": "#feba67", "height": 750, "width": 1000},
                      xaxis=dict(showgrid=False),
                      yaxis=dict(showgrid=False),
                      title_font_color="#122630",
                      title=dict(font=dict(size=28), yref='paper')
                      )

    fig.show()

In [None]:
make_tsne(X, dim_reduction=2000, n_perplexity=10, n_iterations=5000)

In [None]:
# for dim in [500, 1000, 1500, 2000]:    
#     for perp in [10, 30, 50, 70]:
#         for iteration in [1000, 2000, 3000, 4000, 5000]:
#             make_tsne(X, dim, perp, iteration)