# SLO Twitter Data Analysis - NaN/Non-NaN Values

Set up the Jupyter Notebook kernel for SLO data analysis.

In [1]:
import logging as log
import warnings
import time
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns

# Import custom utility functions.
import slo_twitter_data_analysis_utility_functions as tweet_util_v2

#############################################################
# Adjust parameters to display all contents.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = 1000
# Seaborn setting.
sns.set()
# Set level of precision for float value output.
# The fully-qualified option name is used on purpose: the abbreviated 'precision' pattern can
# match more than one option on newer pandas releases, which makes set_option() raise.
pd.set_option('display.precision', 12)
# Ignore these types of warnings - don't output to console.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
# Matplotlib log settings.
mylog = log.getLogger("matplotlib")
mylog.setLevel(log.INFO)

# Turn debug log statements for various sections of code on/off.
# (adjust log level as necessary)
log.basicConfig(level=log.INFO)


# Import CSV dataset and convert to dataframe.
tweet_dataframe = tweet_util_v2.import_dataset(
    "D:/Dropbox/summer-research-2019/jupyter-notebooks/attribute-datasets/"
    "twitter-dataset-7-10-19-with-irrelevant-tweets-excluded.csv",
    "csv", False)

## NaN versus non-NaN Counts for Each Attribute in the Twitter Dataset


The function below prints, for each requested attribute, the number of rows (examples) in the dataset that are NaN and the number that are non-NaN, using the Pandas `.isnull().sum()` method chain.<br>



In [2]:
def count_nan_non_nan(input_file_path, attribute_name_list, file_type):
    """
    Count the number of NaN and non-NaN examples in a dataset for the specified columns.

    The dataset is loaded into a Pandas dataframe, the dataframe dimensions are printed, and then
    one NaN count and one non-NaN count is printed for every requested attribute.

    :param input_file_path: file path of the dataset in CSV or JSON (one record per line) format.
    :param attribute_name_list: list of names of the attributes (columns) we are analyzing.
    :param file_type: type of input file, "csv" or "json" (case-insensitive).
    :return: None.
    :raises KeyError: if any requested attribute is not a column of the dataset.
    """
    start_time = time.perf_counter()

    # Accept "CSV"/"JSON" as well as the lowercase spellings.
    normalized_file_type = str(file_type).lower()

    if normalized_file_type == "csv":
        # dtype=object disables per-column type inference, so values are kept as read.
        twitter_data = pd.read_csv(input_file_path, sep=",", encoding="ISO-8859-1", dtype=object)
    elif normalized_file_type == "json":
        twitter_data = pd.read_json(input_file_path, orient='records', lines=True)
    else:
        print("Invalid file type entered - aborting operation")
        return

    number_examples, number_attributes = twitter_data.shape
    print(f"\nThe number of rows (examples) in the dataframe is {number_examples}")
    print(f"The number of columns (attributes) in the dataframe is {number_attributes}\n")

    # Fail fast and report every unknown attribute at once, instead of stopping part-way
    # through the output on the first missing column.
    missing_attributes = [name for name in attribute_name_list if name not in twitter_data.columns]
    if missing_attributes:
        raise KeyError(f"Attributes not found in the dataset: {missing_attributes}")

    for attribute_name in attribute_name_list:
        null_examples = twitter_data[attribute_name].isnull().sum()
        non_null_examples = number_examples - null_examples

        print(f"The number of NaN rows for \"{attribute_name}\" is {null_examples}")
        print(f"The number of non-NaN rows for \"{attribute_name}\" is {non_null_examples}\n")

    time_elapsed_seconds = time.perf_counter() - start_time
    time_elapsed_minutes = time_elapsed_seconds / 60.0
    time_elapsed_hours = time_elapsed_minutes / 60.0
    log.debug(f"The time taken to visualize the statistics is {time_elapsed_seconds} seconds, "
              f"{time_elapsed_minutes} minutes, {time_elapsed_hours} hours")


Define the attribute name lists and call the function on the main dataset.<br>



In [4]:
# Attribute names of the original Tweet object, as present in the raw JSON file.
original_tweet_object_field_names = [
    "created_at", "id", "full_text",
    "in_reply_to_status_id", "in_reply_to_user_id", "in_reply_to_screen_name",
    "retweet_count", "favorite_count", "lang",
]

# Renamed versions of the main Tweet object attributes.
tweet_object_fields = [
    "tweet_created_at", "tweet_id", "tweet_full_text",
    "tweet_in_reply_to_status_id", "tweet_in_reply_to_user_id", "tweet_in_reply_to_screen_name",
    "tweet_retweet_count", "tweet_favorite_count", "tweet_lang",
]

# Names given to the "user" object attributes.
user_object_fields = [
    "user_id", "user_name", "user_screen_name",
    "user_location", "user_description",
    "user_followers_count", "user_friends_count", "user_listed_count",
    "user_favourites_count", "user_statuses_count",
    "user_created_at", "user_time_zone", "user_lang",
]

# Names given to the "entities" object attributes.
entities_object_fields = [
    "tweet_entities_expanded_urls",
    "tweet_entities_hashtags",
    "tweet_entities_user_mentions_id",
    "tweet_entities_user_mentions_name",
    "tweet_entities_user_mentions_screen_name",
    "tweet_entities_symbols",
]

# Names given to the "retweeted_status" object attributes.
retweeted_status_object_fields = [
    "retweeted_status_created_at", "retweeted_status_id", "retweeted_status_full_text",
    "retweeted_status_in_reply_to_status_id",
    "retweeted_status_in_reply_to_user_id",
    "retweeted_status_in_reply_to_screen_name",
    "retweeted_status_retweet_count", "retweeted_status_favorite_count",
    "retweeted_status_lang",
    "retweeted_status_entities", "retweeted_status_user",
    "retweeted_status_coordinates", "retweeted_status_place",
]

# Names given to the "user" object attributes nested inside "retweeted_status".
retweeted_status_user_object_fields = [
    "retweeted_status_user_id",
    "retweeted_status_user_name",
    "retweeted_status_user_screen_name",
    "retweeted_status_user_location",
    "retweeted_status_user_description",
    "retweeted_status_user_followers_count",
    "retweeted_status_user_friends_count",
    "retweeted_status_user_listed_count",
    "retweeted_status_user_favourites_count",
    "retweeted_status_user_statuses_count",
    "retweeted_status_user_created_at",
    "retweeted_status_user_time_zone",
    "retweeted_status_user_lang",
]

# Attributes analyzed in the main dataset: the derived attributes first, followed by the
# Tweet, user and entities attributes. Modify this list to change what is analyzed.
required_fields = [
    "retweeted_derived",
    "company_derived",
    "text_derived",
    # "tweet_quoted_status_id",
    "tweet_url_link_derived",
    "multiple_companies_derived_count",
    "company_derived_designation",
    "tweet_text_length_derived",
    "spaCy_language_detect_all_tweets",
    "user_description_text_length",
    # "polyglot_lang_detect_all_tweets"
    *tweet_object_fields,
    *user_object_fields,
    *entities_object_fields,
]

# Attributes analyzed in the "extra" dataset: the Tweet ID plus the retweeted_status attributes.
extra_fields = ["tweet_id", *retweeted_status_object_fields]

# Determine the number of NaN and non-NaN rows for each attribute in required_fields.
count_nan_non_nan(
    input_file_path=(
        "D:/Dropbox/summer-research-2019/jupyter-notebooks/attribute-datasets/"
        "twitter-dataset-7-10-19-with-irrelevant-tweets-excluded.csv"
    ),
    attribute_name_list=required_fields,
    file_type="csv",
)


The number of rows (examples) in the dataframe is 654618
The number of columns (attributes) in the dataframe is 37

The number of NaN rows for "retweeted_derived" is 0
The number of non-NaN rows for "retweeted_derived" is 654618

The number of NaN rows for "company_derived" is 0
The number of non-NaN rows for "company_derived" is 654618

The number of NaN rows for "text_derived" is 0
The number of non-NaN rows for "text_derived" is 654618

The number of NaN rows for "tweet_url_link_derived" is 0
The number of non-NaN rows for "tweet_url_link_derived" is 654618

The number of NaN rows for "multiple_companies_derived_count" is 0
The number of non-NaN rows for "multiple_companies_derived_count" is 654618

The number of NaN rows for "company_derived_designation" is 0
The number of non-NaN rows for "company_derived_designation" is 654618

The number of NaN rows for "tweet_text_length_derived" is 0
The number of non-NaN rows for "tweet_text_length_derived" is 654618

The number of NaN rows 


Each attribute name appears in double quotation marks. Each pair of lines between blank lines gives the statistics for a single attribute.<br>



In [5]:
# Determine the number of NaN and non-NaN rows for each attribute in extra_fields,
# this time reading the "-extra" CSV dataset.
count_nan_non_nan(
    input_file_path=(
        "D:/Dropbox/summer-research-2019/jupyter-notebooks/attribute-datasets/"
        "twitter-dataset-7-10-19-with-irrelevant-tweets-excluded-extra.csv"
    ),
    attribute_name_list=extra_fields,
    file_type="csv",
)


The number of rows (examples) in the dataframe is 654618
The number of columns (attributes) in the dataframe is 14

The number of NaN rows for "tweet_id" is 0
The number of non-NaN rows for "tweet_id" is 654618

The number of NaN rows for "retweeted_status_created_at" is 214705
The number of non-NaN rows for "retweeted_status_created_at" is 439913

The number of NaN rows for "retweeted_status_id" is 214705
The number of non-NaN rows for "retweeted_status_id" is 439913

The number of NaN rows for "retweeted_status_full_text" is 214705
The number of non-NaN rows for "retweeted_status_full_text" is 439913

The number of NaN rows for "retweeted_status_in_reply_to_status_id" is 620672
The number of non-NaN rows for "retweeted_status_in_reply_to_status_id" is 33946

The number of NaN rows for "retweeted_status_in_reply_to_user_id" is 615174
The number of non-NaN rows for "retweeted_status_in_reply_to_user_id" is 39444

The number of NaN rows for "retweeted_status_in_reply_to_screen_name" is