In [1]:
import time, pandas as pd
from lm.markov.models import LM

# Load the training essays (path is relative to the notebook's working directory).
data = pd.read_csv("A3_csv/train_essays.csv")

# Keep only rows flagged generated == 0 (presumably the human-written essays)
# and split them by prompt: prompt_id 0 -> `cars`, prompt_id 1 -> `cities`.
cars = data.loc[data["generated"].eq(0) & data["prompt_id"].eq(0)]
cities = data.loc[data["generated"].eq(0) & data["prompt_id"].eq(1)]

# The bare string below is a no-op expression kept as an in-notebook citation
# (presumably the reference for the imported `lm` package -- confirm).
"""
@inproceedings{pavlopoulos2021customized,
  title={Customized Neural Predictive Medical Text: A Use-Case on Caregivers},
  author={Pavlopoulos, John and Papapetrou, Panagiotis},
  booktitle={Artificial Intelligence in Medicine: 19th International Conference on Artificial Intelligence in Medicine, AIME 2021, Virtual Event, June 15--18, 2021, Proceedings},
  pages={438--443},
  year={2021},
  organization={Springer}
}
"""
# Create the language-model instance that later cells train and sample from.
lm = LM()



In [2]:
# Flatten each subset into one training string; essays are separated by " \n ".
car_corpus = " \n ".join(cars["text"].tolist())
cities_corpus = " \n ".join(cities["text"].tolist())

In [3]:


def generate_essays(corpus: pd.DataFrame, lm, n_essays: int = 700):
    """Generate synthetic essays with an already-trained language model.

    Every generated essay is requested with the same length: the rounded mean
    whitespace-separated word count of the ``text`` column of ``corpus``.

    Parameters
    ----------
    corpus : pd.DataFrame
        Reference essays; must expose a ``text`` column of strings.
    lm : object
        Model exposing ``generate_text(grams_num=...)``. It is expected to be
        trained by the caller before this function is invoked.
    n_essays : int, default 700
        Number of essays to generate (previously hard-coded to 700).

    Returns
    -------
    pd.DataFrame
        Frame with a single ``text`` column holding the generated essays.

    Raises
    ------
    ValueError
        If ``corpus`` is empty, because no mean length can be derived.
    """
    if corpus.empty:
        # Without this guard the mean is NaN and round() fails with an
        # unhelpful "cannot convert float NaN to integer" ValueError.
        raise ValueError("corpus is empty: cannot derive a mean essay length")

    # Mean number of whitespace-separated tokens per reference essay
    mean_word_count = corpus.text.apply(lambda essay: len(essay.split())).mean()

    # generate_text is given an integer length, so round the mean
    word_count = round(mean_word_count)

    # One generate_text call per essay, all with the same requested length
    lm_essays = [lm.generate_text(grams_num=word_count) for _ in range(n_essays)]

    # Store the results in a DF
    lm_df = pd.DataFrame(columns=["text"], data=lm_essays)

    # Preview up to three random rows; sample(3) alone raises when the frame
    # holds fewer than three rows.
    display(lm_df.sample(min(3, len(lm_df))))

    return lm_df
    
    
    
# Fit the shared `lm` instance on the joined cars corpus.
lm.train(car_corpus)
# Generate the synthetic cars essays (generate_essays yields 700 per call as
# used here, not 400); their length follows the mean word count of `cars`.
lm_cars_df = generate_essays(cars, lm)


# Fit the same `lm` instance on the joined cities corpus.
# NOTE(review): `lm` was already trained on the cars corpus above; whether
# train() resets or accumulates earlier statistics is not visible here -- confirm.
lm.train(cities_corpus)
# Generate the synthetic cities essays (again 700 per call as used here).
lm_cities_df= generate_essays(cities, lm)

Unnamed: 0,text
45,Cars improves as bloomed that faster they worr...
551,Cars is a hole only kind of. By banner beautif...
646,"Cars car frience. ""It's and new, and get examp..."


Unnamed: 0,text
146,The around irrations and not a sing the people...
0,"The people...."" There. I have electoral Colleg..."
548,The populate when to vote an im ther restores ...


In [4]:
import pandas as pd,uuid, numpy as np

def add_cols(lm_df: pd.DataFrame):
    """Attach label and identifier columns to a frame of generated essays.

    The frame is modified in place and the same object is returned.

    Columns added:
      * ``generated`` -- integer 1 on every row.
      * ``essay_id``  -- first 8 hex characters of a fresh random UUID4 per row.

    A random three-row sample is displayed as a quick visual check.
    """
    n_rows = len(lm_df)

    # Every row of this frame is marked as generated
    lm_df['generated'] = np.full(n_rows, 1, dtype=int)

    # One short random identifier per essay
    essay_ids = []
    for _ in range(n_rows):
        essay_ids.append(uuid.uuid4().hex[:8])
    lm_df['essay_id'] = essay_ids

    # Show the effect of the changes
    preview = lm_df.sample(3)
    display(preview)

    return lm_df

# Attach the `generated` flag and `essay_id` columns to both generated sets.
lm_cars_df = add_cols(lm_cars_df)
lm_cities_df = add_cols(lm_cities_df)

# Tag each set with its prompt: 0 for the cars frame, 1 for the cities frame.
lm_cars_df["prompt_id"] = np.full(len(lm_cars_df), 0, dtype=int)
lm_cities_df["prompt_id"] = np.full(len(lm_cities_df), 1, dtype=int)


# Preview three random rows from each frame
display(lm_cars_df.sample(3), lm_cities_df.sample(3))

Unnamed: 0,text,generated,essay_id
648,"Cars"", the city have less pollution caused and...",1,af80d0de
327,"Cars imporanaports one's, the environment of s...",1,6d1163b4
398,"Cars with sidewalkway,"" said their new rescind...",1,d7b33b5a


Unnamed: 0,text,generated,essay_id
83,The or voters entire asse that in Congress is ...,1,bc81f6a9
634,"The represident, and voting them, the US, even...",1,1e826f86
141,The Electors not why the Founding to have or c...,1,a89fd101


Unnamed: 0,text,generated,essay_id,prompt_id
427,"Cars die a day."" Withouse to take if your envi...",1,5af94fd4,0
88,Cars that weighborhood. This ever had be nicer...,1,5964a153,0
421,"Cars one, sort fad have buying to pay fixing a...",1,203b402a,0


Unnamed: 0,text,generated,essay_id,prompt_id
605,The electoral College voting crisis in Ohio an...,1,38552e6f,1
446,The Electors thats to popular vote. I believe ...,1,4ad7457d,1
44,The Electoral can even with the eligible the l...,1,c8865c0b,1


In [6]:
# Persist both generated sets as CSV files, leaving the row index out.
lm_cars_df.to_csv(path_or_buf="A3_csv/lm_generated_cars.csv", index=False)
lm_cities_df.to_csv(path_or_buf="A3_csv/lm_generated_cities.csv", index=False)