In [None]:
# Mount Google Drive into the Colab filesystem so the project files under
# /content/drive become reachable. This only works inside a Google Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on the mounted Drive so the
# relative paths used by later cells (e.g. 'data.csv') resolve, then list its contents.
# NOTE(review): this absolute path is specific to one Drive layout.
%cd /content/drive/My Drive/research/traffic
!ls

/content/drive/My Drive/research/traffic
 address.txt			        my_spacy_annotation.ipynb
 annotations.json		        my_tokenizer_module.ipynb
 best_sentiment_analysis_pipeline.pkl   my_traffic_class_module.ipynb
 config.cfg			        my_word_cloud_module.ipynb
 data.csv			       'NLP for Traffic Mgt.ipynb'
 model-best			        __pycache__
 model-last			        selected_data.txt
 my_clean_text_module.ipynb	        show_traffic_levels.ipynb
 my_coordinates_module.ipynb	        testing.ipynb
 mydata.txt			        training.spacy
 my_sentiment_module.ipynb


In [None]:
import pandas as pd

# Load only the 'date' and 'content' columns from the CSV file and drop rows
# that have a missing value in either of them.
df = pd.read_csv('data.csv', usecols=["date", "content"]).dropna()

# Disabled preprocessing step (would strip URLs, mentions, and hashtags).
# NOTE(review): it targets a 'text' column, but only 'date' and 'content' are loaded
# above, so it would need to operate on 'content' before being re-enabled.
# df['text'] = df['text'].str.replace('http\S+|www.\S+|@\S+|#\S+', '', case=False)
# Print column names, non-null counts, and dtypes as a quick sanity check.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     181 non-null    object
 1   content  181 non-null    object
dtypes: object(2)
memory usage: 3.0+ KB


In [None]:
# Expand list-like entries of 'content' into one row per element.
# NOTE(review): the sample value displayed further down is a plain string; if every
# entry is a string this call leaves the frame unchanged — confirm it is needed.
df = df.explode("content")

In [None]:
# Preview the first five rows of the loaded tweets.
df.head()

Unnamed: 0,date,content
0,2023-02-09 07:12:43+00:00,"Acme junction inward Agidingbi, Cadbury juncti..."
1,2023-02-09 07:18:48+00:00,Approaching Lekki conservation centre is busy ...
2,2023-02-09 07:18:49+00:00,Connecting Marwa from Ikate is good Marwa back...
3,2023-02-09 07:29:07+00:00,Cele inward Cele/ijesha fly over bridge is bus...
4,2023-02-09 07:29:08+00:00,Movement from Cele inward Ago roundabout is bu...


In [None]:
# Show the full, untruncated text of the row labelled 0.
df.content[0]

'Acme junction inward Agidingbi, Cadbury junction, Mobil junction, daily times down to Coca-cola junction is good to go.     '

In [None]:
from transformers import pipeline

# Build a text-generation pipeline backed by the pre-trained GPT-2 checkpoint.
model = pipeline('text-generation', model='gpt2')

# NOTE(review): this loop does not train anything. Each line of the file (including
# its trailing newline) is used as a generation prompt and the generated text is
# discarded. 'preprocessed_traffic_tweets.csv' also does not appear in the directory
# listing shown earlier in this notebook — confirm where it is produced.
with open('preprocessed_traffic_tweets.csv', 'r') as f:
    for line in f:
        model(line)

ModuleNotFoundError: ignored

In [None]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer

# Load the pre-trained model and tokenizer. The 4-way classification head is
# freshly initialised and is what the fine-tuning below actually trains.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

# Convert the preprocessed tweets to input features
# NOTE(review): assumes the CSV has a 'text' column and an integer 'label' column
# with class ids in 0..3 — confirm against the file that produces it.
tweets = pd.read_csv('preprocessed_traffic_tweets.csv')
inputs = tokenizer(list(tweets['text']), padding=True, truncation=True, max_length=512, return_tensors='tf')

# Define the training data.
# Keras interprets each dataset element as (features, labels[, sample_weights]).
# The tokenizer outputs therefore have to travel together as ONE features dict;
# a flat (input_ids, attention_mask, labels) tuple would make Keras treat the
# attention mask as the target and the labels as per-sample weights.
features = {
    'input_ids': inputs['input_ids'],
    'attention_mask': inputs['attention_mask'],
}
dataset = tf.data.Dataset.from_tensor_slices((features, tweets['label'].values))

# Define the training parameters
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# The model returns raw logits, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train the model (shuffle over the whole dataset, mini-batches of 8, 3 epochs)
model.fit(dataset.shuffle(len(tweets)).batch(8), epochs=3)


In [None]:
import numpy as np

# Use the fine-tuned `model` and `tokenizer` created by the training cell above.
# Calling from_pretrained("distilbert-base-uncased", num_labels=4) again here would
# load the base checkpoint with a new, randomly initialised classification head and
# silently throw the fine-tuning away.
# NOTE(review): to predict in a fresh session, persist the trained model with
# model.save_pretrained(<dir>) and load it back from that directory instead.

# Prepare the input text
input_text = "There's a serious accident on the highway."

# Convert the input text to input features. The tokenizer call returns the
# attention mask alongside the ids, so it does not have to be rebuilt by hand.
encoded = tokenizer(input_text, padding=True, truncation=True, max_length=512, return_tensors='tf')
input_ids = encoded['input_ids']
input_mask = encoded['attention_mask']
inputs = {'input_ids': input_ids, 'attention_mask': input_mask}

# Make a prediction: take the highest-scoring class for the single input row.
logits = model(inputs).logits
prediction = np.argmax(logits, axis=1)[0]

# Map the prediction to the corresponding class label
# NOTE(review): the order must match the integer label ids used during training — confirm.
class_labels = ['accidents', 'free-flow', 'mild traffic', 'breakdown']
predicted_class = class_labels[prediction]


In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Fetch the NLTK data packages used below: the stop-word lists and the
# 'punkt' tokenizer models needed by word_tokenize.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Define the 5 different traffic levels and the keywords that signal each one.
# The order of the levels matters: the classifier returns the first level that matches.
# Keywords are kept lower-case (tweets are lower-cased before matching, so a
# capitalised keyword could never match) and free of duplicates.
traffic_levels = {
    'accident': ['accident', 'crash', 'collision', 'pile-up', "casualty", "disaster", "mishap", "catastrophe", "tragedy", "mischance", "misfortune", "skidded off"],
    'breakdown': ['breakdown', 'malfunction', 'engine failure', "break", "broken", "break down", "evacuation"],
    'free-flow': ['free-flow', 'smooth traffic', 'no delays', "good", "okay", "gd", "free", "encouraging", "gtg"],
    'mild traffic': ['mild traffic', 'slight delays', 'slow-moving traffic', "fair", "not so bad", "bit slow", "better"],
    'heavy traffic': ['heavy traffic', 'congestion', 'gridlock', "busy", "bad road", "bad narrow", "bad path", "traffic jam", "bottle-necked", "high side", "impediment", "negative", "slow"]
}


# def classify_tweet(tweet_content):

#   # Tokenize the tweet text
#   tokens = word_tokenize(tweet_content.lower())

#   # Remove stop words from the tokens
#   stop_words = set(stopwords.words('english'))
#   tokens = [token for token in tokens if not token in stop_words]

#   # Return the tokens as a list of strings
#   return tokens

# # Create a new dictionary with string values
# traffic_levels_str = {}
# for key, value in traffic_levels.items():
#     traffic_levels_str[key] = ' '.join(value)

# # Use the new dictionary in TfidfVectorizer
# vectorizer = TfidfVectorizer(vocabulary=traffic_levels_str, tokenizer=classify_tweet)

# vectorizer.fit_transform(df['content'])
# print(vectorizer.vocabulary_)





# English stop words, loaded once instead of on every call.
_STOP_WORDS = frozenset(stopwords.words('english'))


# Define a function to classify a tweet based on its content
def classify_tweet(tweet_content):
  """Return the first traffic level whose keyword list matches the tweet.

  Levels are tried in the order they appear in `traffic_levels`. Single-word
  keywords are matched against the tweet's tokens after stop-word removal.
  Multi-word keywords (e.g. "traffic jam", "not so bad") are matched as whole
  token sequences against the full token stream, because a phrase can never
  equal a single token and may itself contain stop words.

  Args:
    tweet_content: The raw tweet text (a string).

  Returns:
    The matching level name, or 'unknown' when no keyword matches.
  """
  # Tokenize the lower-cased tweet text
  all_tokens = word_tokenize(tweet_content.lower())

  # Tokens with stop words removed, used for single-word keywords
  content_tokens = {token for token in all_tokens if token not in _STOP_WORDS}

  # Space-padded token stream so phrases only match on token boundaries
  padded_text = ' ' + ' '.join(all_tokens) + ' '

  # Map the tokens to the traffic levels; the first matching level wins
  for level, keywords in traffic_levels.items():
    for keyword in keywords:
      keyword = keyword.lower()
      if ' ' in keyword:
        if f' {keyword} ' in padded_text:
          return level
      elif keyword in content_tokens:
        return level

  # No keyword matched
  return 'unknown'

# Apply the classify_tweet function to the 'content' column of the dataframe
df['level'] = df['content'].apply(classify_tweet)

# Space-joined keyword string per level (kept for any later cell that reads it)
traffic_levels_str = {}
for key, value in traffic_levels.items():
    traffic_levels_str[key] = ' '.join(value)

# Fixed vocabulary for the vectorizer: every distinct keyword, lower-cased.
# TfidfVectorizer expects `vocabulary` to be an iterable of unique terms (or a
# term -> column-index mapping), not a level -> string dict.
keyword_vocabulary = sorted({
    keyword.lower()
    for keywords in traffic_levels.values()
    for keyword in keywords
})

# Longest keyword measured in words, so multi-word keywords can be produced as n-grams.
max_keyword_words = max(len(term.split()) for term in keyword_vocabulary)

# `tokenizer` must return a list of tokens, so word_tokenize is used here;
# classify_tweet returns a single label string and is not a tokenizer.
# token_pattern=None marks the default pattern as intentionally unused.
vectorizer = TfidfVectorizer(
    vocabulary=keyword_vocabulary,
    tokenizer=word_tokenize,
    token_pattern=None,
    ngram_range=(1, max_keyword_words),
)

# Transform the 'content' column of the dataframe using the TfidfVectorizer
features = vectorizer.fit_transform(df['content'])

# Define the labels for the machine learning algorithm
# NOTE(review): these labels come from the same keyword lists as the features,
# so a model trained on them can at best reproduce the keyword rules.
labels = df['level']

In [None]:
import gmaps
import pandas as pd
from sklearn.cluster import DBSCAN

# Cluster geographic points with DBSCAN and draw them on a gmaps figure together
# with a heatmap and a colour legend.
# NOTE(review): `data` is not defined in any cell visible in this notebook (the
# earlier cells only create `df`, which has no 'lat'/'lng' columns) — confirm
# where it is expected to come from.

# Define the center of the map
center = (data['lat'].mean(), data['lng'].mean())

# Create a numpy array of latitudes and longitudes
locations = data[['lat', 'lng']].values

# Use DBSCAN to cluster the points
# NOTE(review): eps is applied directly to the raw lat/lng values — confirm 0.01 is the intended radius.
dbscan = DBSCAN(eps=0.01, min_samples=10)
clusters = dbscan.fit_predict(locations)

# Calculate the density of each cluster
cluster_counts = dict()
for cluster_id in set(clusters):
    # -1 is the label skipped here as noise
    if cluster_id == -1:
        continue
    cluster_locations = locations[clusters == cluster_id]
    # NOTE(review): confirm that the gmaps package provides `distance_matrix`. If it
    # returns pairwise distances that include each point's zero distance to itself,
    # the row-wise minimum is 0 and this division fails — verify.
    density = len(cluster_locations) / (gmaps.distance_matrix(cluster_locations, cluster_locations).min(axis=1).mean()**2)
    cluster_counts[cluster_id] = density

# Create the map figure
fig = gmaps.figure(center=center, zoom_level=12)

# Create a list of colors for the clusters
# colors = ['red', 'green', 'blue', 'yellow', 'purple', 'orange', 'pink', 'brown', 'gray', 'olive']
colors = ['rgba(255, 0, 0, 0)', 'rgba(255, 0, 0, 1)', 'rgba(255, 191, 0, 1)', 'rgba(255, 255, 0, 1)', 'rgba(0, 255, 0, 1)']
# Legend labels are simply the positions of the colours in the list ("0".."4").
legend_entries = [(colors[i], str(i)) for i in range(len(colors))]


# Create a list of markers for each cluster
markers = []
# NOTE(review): zip() stops at the shorter input, so at most len(colors) cluster ids
# are visited; a noise id (-1) also consumes one colour before it is skipped.
for cluster_id, color in zip(set(clusters), colors):
    if cluster_id == -1:
        # This assignment has no effect because the iteration is skipped right after it.
        color = 'black'
        continue  # Skip noise cluster
    if cluster_id not in cluster_counts:  # Skip cluster if it has no density value
        continue
    cluster_locations = locations[clusters == cluster_id]
    if len(cluster_locations) == 0:
        continue
    # One symbol layer per cluster; the info box shows the cluster's density value.
    marker_layer = gmaps.symbol_layer(
        cluster_locations,
        fill_color=color,
        stroke_color=color,
        scale=2,
        info_box_content=str(cluster_counts[cluster_id]),
    )
    markers.append(marker_layer)


# Add a heatmap of all points, then the per-cluster marker layers, to the map
fig.add_layer(gmaps.heatmap_layer(locations))
for marker_layer in markers:
    fig.add_layer(marker_layer)

# Create the legend
# NOTE(review): confirm that gmaps.Layer, gmaps.Symbol and gmaps.Map exist and accept
# these arguments in the installed gmaps version.
legend_markers = []
for color, label in legend_entries:
    legend_markers.append(gmaps.Layer(
        gmaps.Symbol(
            location=(0, 0),
            stroke_color=color,
            fill_color=color,
            scale=2,
            label=label,
            anchor=(0, -2)
        )
    ))
    
legend = gmaps.Map(layout={'width': '400px', 'height': '300px'})
legend.layers = legend_markers

# NOTE(review): `locations_info` is not defined in any visible cell. `colors` holds
# five entries while `locations` has one row per point — confirm a per-point colour
# list of a different length is accepted here.
fig.add_layer(gmaps.symbol_layer(
    locations, fill_color=colors, stroke_color=colors, scale=2, 
    info_box_content=locations_info, display_info_box=True))
fig.add_layer(legend)


# Display the map
fig
