In [None]:
import pymc as pm
import matplotlib.pyplot as plt
import arviz as az
import pandas as pd
from scipy import special, stats
import numpy as np
import seaborn as sns
from cycler import cycler
from patsy import bs, dmatrix
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import wikipedia
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import json

# Colab-only step: files.upload() brings the narratives file into the runtime.
# NOTE(review): the recorded output of this cell shows the upload being saved as
# "TrainNarratives (1).txt", while the later cells open 'TrainNarratives.txt' --
# confirm that the copy being read is the intended one.
from google.colab import files
uploaded = files.upload()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Saving TrainNarratives.txt to TrainNarratives (1).txt


In [None]:
# English stop words from the NLTK stopwords corpus, held as a set;
# preprocessor() below tests each token for membership in it.
stopWords = set(stopwords.words('english'))

def preprocessor(text):
    """Remove English stop words from one narrative.

    Input:
        text = a single narrative as a string
    Output:
        The tokens produced by nltk.word_tokenize that are not in stopWords,
        joined by single spaces. Kept tokens retain their original casing.
    """
    tokens = nltk.word_tokenize(text)
    # The stop-word list is lower case while the accident narratives are
    # written in upper case, so the membership test must be done on the
    # lower-cased token; a case-sensitive test would filter nothing.
    return " ".join(word for word in tokens if word.lower() not in stopWords)

In [None]:
class LDA_trains:
    """Creates a class for Latent Dirichlet Allocation using summaries from train reports
    Input:
        reports = list of narratives from accident reports
        N_topics = number of topics for LDA to produce
        N_words = the number of words to show in a topic
        random_state = optional seed forwarded to LatentDirichletAllocation so that
            repeated fits give the same topics (default None = unseeded)
    Methods:
        Topics = Fits the LDA model and outputs the top words of each topic as a dataframe
        Predict_Topics
            Input: New narratives in a list (a single string is treated as one narrative)
            Output: A dataframe with the probabilities for topics for each new narrative"""

    def __init__(self, reports, N_topics=3, N_words = 10, random_state=None):
        # the narrative reports
        self.reports = reports
        # initialize variables
        self.N_topics = N_topics
        self.N_words = N_words
        self.random_state = random_state
        # The LDA model is created by _fit(); None means "not fitted yet"
        self.lda = None

        # Get the word counts in the reports
        self.countVectorizer = CountVectorizer(stop_words='english')
        self.termFrequency = self.countVectorizer.fit_transform(self.reports)
        self.Words = self.countVectorizer.get_feature_names_out()

    def _fit(self):
        # Fit a fresh LDA model on the training word counts and store it on self.lda
        self.lda = LatentDirichletAllocation(n_components=self.N_topics,
                                             random_state=self.random_state)
        self.lda.fit(self.termFrequency)
        return self.lda

    def Topics(self):
        """Fit the LDA model and return its topics as a dataframe.

        Every call refits the model. Without a random_state the topic numbering
        can differ between calls, so call this once before Predict_Topics when
        the displayed topics must match the predicted columns.

        Output: dataframe indexed by topic number ("Topics") with columns
        "Word 0", "Word 1", ... holding each topic's highest-weighted words.
        """
        # Obtain the estimates for the LDA model
        self._fit()

        # For each of the topics in the model add the top N_words to the list of topics
        # (argsort is ascending, so reverse it and keep the first N_words indices)
        topics = [
            [self.Words[i] for i in topic.argsort()[::-1][:self.N_words]]
            for topic in self.lda.components_
        ]

        # Create column names for the output matrix; a topic cannot list more
        # words than the vocabulary holds, so cap the number of columns there
        n_cols = min(self.N_words, len(self.Words))
        cols = ["Word " + str(i) for i in range(n_cols)]

        # Create a dataframe with the topic no. and the words in each topic
        # output this dataframe
        Topics_df = pd.DataFrame(topics, columns = cols)
        Topics_df.index.name = "Topics"
        return Topics_df

    def Predict_Topics(self, new_reports):
        """Estimate topic probabilities for new narratives.

        Input: new_reports = list of narratives (a single string is wrapped
               into a one-element list)
        Output: dataframe with one row per narrative: "Report Number" followed
                by "Topic 0", "Topic 1", ... columns.

        If no model has been fitted yet (Topics() was never called) the model
        is fitted here first.
        """
        # A bare string would otherwise be iterated character by character
        if isinstance(new_reports, str):
            new_reports = [new_reports]
        self.new_reports = new_reports

        # Get the list of new accident report narratives
        # and the number of new narratives
        N_new_reports = len(self.new_reports)

        # Make sure there is a fitted model to predict with
        if getattr(self, "lda", None) is None:
            self._fit()

        # For each of the new narratives
        # obtain the estimated probabilities for each of the topics
        # as estimated by the LDA results on the training set.
        # All narratives are transformed in one call, giving an array
        # of size no. of new reports X no. of topics
        new_docs = [preprocessor(report) for report in self.new_reports]
        if new_docs:
            new_report_topics = self.lda.transform(self.countVectorizer.transform(new_docs))
        else:
            # Nothing to transform: keep the (0 X no. of topics) shape
            new_report_topics = np.empty((0, self.N_topics))

        # Create column names for the output dataframe
        cols = ["Topic " + str(i) for i in range(self.N_topics)]

        # Create the dataframe whose rows contain topic probabilities for
        # specified narratives/reports
        New_Reports_df = pd.DataFrame(new_report_topics, columns = cols )
        New_Reports_df.insert(0, 'Report Number', range(0, N_new_reports))

        return New_Reports_df

In [None]:
from gensim.models import LdaModel as LDA

# Read the train narratives file and parse its JSON content into a dictionary
with open('TrainNarratives.txt') as json_file:
    Narrative_dict = json.loads(json_file.read())

# Only the dictionary values (the narrative texts) are needed, as a list
train_reports = [*Narrative_dict.values()]

# Build the LDA wrapper on the train reports, asking for 10 topics
lda_train = LDA_trains(reports=train_reports, N_topics=10)

# Fit the model and keep the resulting topics dataframe
lda_train_topics = lda_train.Topics()


In [None]:
# Parse the narratives JSON file into a dictionary
with open('TrainNarratives.txt') as json_file:
    Narrative_dict = json.loads(json_file.read())

# Narrative texts only, as a list
train_reports = [*Narrative_dict.values()]
train_reports[:3];  # quick look at the first three; ';' suppresses the output

# New wrapper with 10 topics and 10 words per topic (rebinds lda_train)
lda_train = LDA_trains(train_reports, 10, 10)
lda_train.reports[0];



In [None]:
# Fit the LDA model on the training narratives and display the top words of each topic
lda_train.Topics()

Unnamed: 0_level_0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,hazardous,materials,released,track,derailed,railcars,yard,fuel,leaking,rco
1,derailed,cars,rail,train,track,cause,car,derailment,caused,curve
2,car,cars,track,train,derailed,causing,lead,cut,bowl,end
3,train,damage,track,equipment,car,pantograph,struck,mp,causing,engine
4,car,derailed,track,humping,retarder,cars,derailment,causing,load,hump
5,cars,track,car,end,crew,cut,yard,lead,shoving,derailed
6,train,crew,engineer,emergency,conductor,went,cars,derailed,stop,car
7,car,wheels,journal,engine,ns,derailed,wheel,r1,l1,burned
8,derailed,cars,track,pulling,loads,rail,train,empties,head,wide
9,switch,cars,track,derailed,car,crew,point,lead,lined,yard


In [None]:
# Tokenizer data for NLTK; presumably required by nltk.word_tokenize, which
# preprocessor() calls inside Predict_Topics
nltk.download('punkt')

# Topic probabilities for the first ten training narratives
lda_train.Predict_Topics(train_reports[0:10])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,Report Number,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9
0,0,0.004546,0.382436,0.004546,0.243572,0.004546,0.154321,0.004547,0.004546,0.004548,0.192392
1,1,0.009092,0.009095,0.009092,0.492139,0.009092,0.009094,0.435116,0.009094,0.009093,0.009092
2,2,0.002326,0.377888,0.002326,0.002326,0.002326,0.002326,0.184247,0.226247,0.197661,0.002326
3,3,0.006251,0.006251,0.006251,0.254188,0.006251,0.695802,0.006253,0.006251,0.006252,0.006252
4,4,0.01,0.010001,0.367225,0.552764,0.01,0.010001,0.010003,0.010001,0.010003,0.010001
5,5,0.167705,0.005001,0.005,0.792289,0.005002,0.005,0.005002,0.005,0.005,0.005
6,6,0.002273,0.002273,0.002273,0.163451,0.168543,0.652094,0.002273,0.002273,0.002273,0.002273
7,7,0.002083,0.002084,0.002084,0.002084,0.002084,0.645872,0.337459,0.002084,0.002084,0.002084
8,8,0.003847,0.003847,0.003847,0.003847,0.003846,0.003847,0.065291,0.003846,0.003847,0.903936
9,9,0.007693,0.007693,0.007693,0.106397,0.007693,0.007693,0.007693,0.007693,0.230062,0.609689


In [None]:
# The first ten training narratives, i.e. the same slice that was passed to
# Predict_Topics in the previous cell
top_reports = train_reports[:10]
top_reports

['UNITS 231-281(BACK TO BACK)  WERE COMING INTO UP DEISEL SHOP  WHEN THE LEFT WHEEL OF 281 RODE OVER RECENTLY REPAIRED SWITCH PLATE AND DERAILED. THE CAUSE WAS DETERMINED TO BE THE TRACK TELEMETRY IN THAT IT WAS TOO SHARP OF A CURVE.',
 'ENGINE 286 CAUGHT FIRE AT THE SPRINGFIELD, MA STATION DUE TO BEARINGS IN MAIN GENERATOR LET GO.',
 'TRAIN NO.#4 WITH ENGS 83/11/90/44 AND 11 CARS DERAILED 2 DEADHEAD CARS, C/44834 AND C/9639, WHILE MAKING A SHOVING MOVE ONTO TRACK 28.  THE DERAILMENT WAS DUE TO HIGH BUFF FORCES CAUSED JACKKNIFING OFDEADHEADING AMFLEET CAR 44834 LOCATED DIRECTLY BEHIND ENGINES DUE TO EXCESSIVE AMPERAGE GENERATED BY FOUR P42 LOCOMOTIVES SHOVING TRAIN AGAINST AN APPROXIMATELY 15-POUND BRAKE REDUCTION.',
 'WHILE SHOVING TRAIN 624 SOUTH ON #30 TRACK AT PENN COACH YARD, CONDUCTOR FAILED TO STOP FOR A DERAILAND 2 CARS DERAILED .',
 'TRAIN 786 WAS STRUCK BY A FALLING TREE SOUTH OF SANTA BARBARA, CA.',
 'ENGINE 4403 OF NJT TRAIN 3204 HAD 90% OLD BREAK ON SHOE ASSEMBLY TRUCHION 

Based on the top entries we retrieved, we can see that the most prominent topics include train derailments that leaked hazardous chemicals, other train derailments, construction accidents, and car crashes. This information could help engineers in the relevant departments better prioritize their spending and decide how to allocate funds to deal with these kinds of issues. They could, for example, use these reports as evidence to pressure traffic and safety authorities across the country to raise safety standards and better address the threats posed by failing to maintain rail infrastructure.