# Single-Company versus Multi-Company Associated Tweets

Set up the Jupyter Notebook kernel for SLO data analysis.

In [1]:
import logging as log
import warnings
import time
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns

# Import custom utility functions.
import slo_twitter_data_analysis_utility_functions as tweet_util_v2

#############################################################
# Adjust parameters to display all contents.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = 1000
# Seaborn setting.
sns.set()
# Set level of precision for float value output.
# The fully qualified option name is used on purpose: the abbreviated pattern "precision" is
# ambiguous (and raises OptionError) on pandas versions that also define "styler.format.precision".
pd.set_option('display.precision', 12)
# Ignore these types of warnings - don't output to console.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
# Matplotlib log settings.
mylog = log.getLogger("matplotlib")
mylog.setLevel(log.INFO)

# Turn debug log statements for various sections of code on/off.
# (adjust log level as necessary)
log.basicConfig(level=log.INFO)


# Import CSV dataset and convert to dataframe.
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
tweet_dataframe = tweet_util_v2.import_dataset(
    "D:/Dropbox/summer-research-2019/jupyter-notebooks/attribute-datasets/"
    "twitter-dataset-7-10-19-with-irrelevant-tweets-excluded.csv",
    "csv", False)

## Single or Multiple Companies:


This function displays the number of Tweets associated with only one company or associated with multiple companies.



In [2]:
def tweets_number_associated_companies(tweet_dataframe):
    """
    Function displays statistics on the # of Tweets associated with one versus multiple companies.

    A Tweet is treated as multi-company when its "company_derived_designation" value contains the
    substring "multiple"; rows where that column is null are treated as having no company.

    :param tweet_dataframe: the Twitter dataset in a Pandas dataframe; must contain the
        "company_derived_designation" column.
    :return: None.
    """
    # Number of rows in entire dataframe.
    number_rows_total = tweet_dataframe.shape[0]
    print(f"The # of Tweets in total is {number_rows_total}")

    # Nothing further to report (and nothing to divide by) for an empty dataset.
    if number_rows_total == 0:
        return

    designation = tweet_dataframe['company_derived_designation']
    has_company = tweet_dataframe.loc[designation.notnull()]
    number_rows_with_company = has_company.shape[0]
    print(f"The # of Tweets associated with at least one company is {number_rows_with_company}")

    # Select only rows with one associated company. (don't graph company combos)
    # astype(str) keeps the mask strictly boolean even if the column holds non-string values.
    is_multiple = has_company['company_derived_designation'].astype(str).str.contains(
        "multiple", regex=False)
    single_company_only_df = has_company.loc[~is_multiple]

    # Number of rows associated with only one company / with several companies.
    number_rows_one_company = single_company_only_df.shape[0]
    number_rows_multiple_companies = number_rows_with_company - number_rows_one_company

    print(f"The # of Tweets associated with multiple companies is {number_rows_multiple_companies}")
    print(f"The # of Tweets associated with one company is {number_rows_one_company}")
    print(f"The # of Tweets associated with no company is {number_rows_total - number_rows_with_company}")

    percent_single = number_rows_one_company / number_rows_total * 100.0
    # Only rows that actually have a company can be multi-company; subtracting from the grand
    # total would wrongly count Tweets with no company as multi-company.
    percent_multiple = number_rows_multiple_companies / number_rows_total * 100.0

    print(f"The percentage of the dataset associated with a single company is {percent_single}%")
    print(f"The percentage of the dataset associated with multiple companies is {percent_multiple}%")


Call the data analysis function.<br>



In [3]:
    # Report single- versus multi-company Tweet counts for the full dataset.
    tweets_number_associated_companies(tweet_dataframe=tweet_dataframe)

The # of Tweets in total is 654618
The # of Tweets associated with at least one company is 654618
The # of Tweets associated with multiple companies is 8205
The # of Tweets associated with one company is 646413
The # of Tweets associated with no company is 0
The percentage of the dataset associated with a single company is 98.74659725213789%
The percentage of the dataset associated with multiple companies is 1.253402747862112%



The majority of our Tweets are only associated with a single company.  However, a small percentage are associated with multiple companies.<br>



## Export multi-company associated Tweets to separate CSV file:

    
Function isolates and exports to a new CSV file all Tweets that are associated with more than one company.<br>



In [4]:
    # Isolate multi-company associated Tweets for data analysis and export to new CSV file.
    # NOTE(review): no output path is passed here, so the destination is decided inside the
    # utility module - confirm it matches the path the import cell below reads from.
    tweet_util_v2.export_multi_company_tweets(tweet_dataframe)

    
Import the resulting CSV file and insert into a Pandas Dataframe for data analysis.<br>



In [6]:
    # Import CSV dataset and convert to dataframe.
    # The trailing False presumably suppresses the dataframe summary printout (per the notebook
    # text that follows this cell) - verify against import_dataset.
    # NOTE(review): the describe cell further down reads this same file name from the
    # "data-analysis-datasets/" subdirectory instead - confirm which location is current.
    multi_company_tweets_df = tweet_util_v2.import_dataset(
        "D:/Dropbox/summer-research-2019/jupyter-notebooks/attribute-datasets/multi-company-tweets-7-10-19.csv",
        "csv", False)

    
The shape, column (attribute) names, and the first Tweet in the dataframe will be output if the last parameter in the function call is set to "True".<br>

Note: Dataframe info is disabled at the moment.  Change the last argument from `False` to `True`, if desired.<br>



## Simple Analysis using Pandas.describe():


Statistics for every attribute in the CSV dataset containing just multi-company associated Tweets.<br>



In [7]:
def attribute_describe(input_file_path, attribute_name_list, file_type):
    """
    Function utilizes Pandas "describe" function to print dataframe statistics.

    https://chrisalbon.com/python/data_wrangling/pandas_dataframe_descriptive_stats/

    Note: This function will not work for attributes whose values are "objects" themselves.
    (can only be numeric type or string)

    :param input_file_path: absolute file path of the dataset in CSV or JSON format.
    :param attribute_name_list:  list of names of the attributes we are analyzing; an empty list
        describes the entire dataframe.
    :param file_type: type of input file, "csv" or "json" (case-insensitive).
    :return: None. (prints the statistics; prints an error and returns early on an unknown file type)
    """
    start_time = time.time()

    # Accept "CSV"/"JSON" as advertised above, not just the lowercase spellings.
    normalized_file_type = str(file_type).strip().lower()

    # Both readers already return a dataframe, so no extra wrapping is needed.
    if normalized_file_type == "csv":
        dataframe = pd.read_csv(str(input_file_path), sep=",")
    elif normalized_file_type == "json":
        dataframe = pd.read_json(str(input_file_path), orient='records', lines=True)
    else:
        print("Invalid file type entered - aborting operation")
        return

    if attribute_name_list:
        for attribute_name in attribute_name_list:
            print(f"\nPandas describe() for \"{attribute_name}\":\n")
            print(dataframe[attribute_name].describe(include='all'))
    else:
        print("\nPandas describe() for the entire dataframe/dataset:\n")
        print(dataframe.describe(include='all'))

    # Elapsed time is only emitted at DEBUG level.
    time_elapsed_seconds = time.time() - start_time
    time_elapsed_minutes = time_elapsed_seconds / 60.0
    time_elapsed_hours = time_elapsed_seconds / 60.0 / 60.0
    log.debug(f"The time taken to visualize the statistics is {time_elapsed_seconds} seconds, "
              f"{time_elapsed_minutes} minutes, {time_elapsed_hours} hours")


Call the above function.<br>



In [11]:
    # Summarize every attribute of the multi-company Tweet export (empty list = whole dataframe).
    attribute_describe(
        input_file_path="D:/Dropbox/summer-research-2019/jupyter-notebooks/attribute-datasets/"
                        "data-analysis-datasets/multi-company-tweets-7-10-19.csv",
        attribute_name_list=[],
        file_type="csv")


Pandas describe() for the entire dataframe/dataset:

       retweeted_derived company_derived  \
count               8205            8205   
unique                 2              80   
top                 True    bhp|riotinto   
freq                4662            3573   
mean                 NaN             NaN   
std                  NaN             NaN   
min                  NaN             NaN   
25%                  NaN             NaN   
50%                  NaN             NaN   
75%                  NaN             NaN   
max                  NaN             NaN   

                                                                                                                                                                        text_derived  \
count                                                                                                                                                                           8205   
unique                                           


After looking at the Tweets directly in Microsoft Excel by loading the CSV dataset file, we have decided to include multi-company associated Tweets in our SLO Twitter data analysis statistics and graphs for the various sections listed in the Jupyter Notebook file titled "slo-twitter-data-analysis-table-of-contents.ipynb".<br>



## Count of Tweets Assumed to be Stock Associated Tweets:

In [12]:
    # Determine count of stock symbols ("$" immediately followed by word characters) per Tweet.
    # str.count gives the same number as len(findall(...)) but yields NaN instead of raising when
    # a Tweet has no text, so those rows are filled with 0.
    # NOTE(review): "\w" also matches digits, so dollar amounts such as "$100" are counted as
    # symbols too - confirm that is intended.
    multi_company_tweets_df['#symbols'] = (
        multi_company_tweets_df.text_derived.str.count(r"\$\w+").fillna(0).astype(int))

    # Tweets with over 2 company assignments and possessing stock symbols are assumed to be stock Tweets.
    assumed_stock_tweets = multi_company_tweets_df.loc[
        (multi_company_tweets_df["multiple_companies_derived_count"] > 2) & (multi_company_tweets_df['#symbols'] > 0)]

    number_multi_company_tweets = multi_company_tweets_df.shape[0]
    number_assumed_stock_tweets = assumed_stock_tweets.shape[0]
    # Avoid dividing by zero when the multi-company dataset is empty.
    percent_assumed_stock_tweets = (
        number_assumed_stock_tweets / number_multi_company_tweets * 100 if number_multi_company_tweets else 0.0)

    print(f"The # of multi-company associated Tweets assumed to be stock Tweets is {number_assumed_stock_tweets}")
    print(f"The percentage of multi-company associated Tweets assumed to be stock Tweets is "
          f"{percent_assumed_stock_tweets}%")
    print("Note: This is based on the conditions that the # of associated companies is greater than 2 and "
          "there are stock symbols found in the Tweet.")

The # of multi-company associated Tweets assumed to be stock Tweets is 122
The percentage of multi-company associated Tweets assumed to be stock Tweets is 1.4868982327848872%
Note: This is based on the conditions that the # of associated companies is greater than 2 and there are stock symbols found in the Tweet.


## Simple Check for non-Company associated Tweets:


The code provides a means by which we can use Pandas to check for any Tweets in our dataset that have or lack a value for a specific attribute or set of attributes.<br>



In [13]:
def count_nan_non_nan(input_file_path, attribute_name_list, file_type):
    """
    Function counts the number of NaN and non-NaN examples in a Pandas dataframe for the specified columns.

    :param input_file_path: absolute file path of the dataset in CSV or JSON format.
    :param attribute_name_list:  list of names of the attributes we are analyzing.
    :param file_type: type of input file, "csv" or "json" (case-insensitive).
    :return: None. (prints the counts; prints an error and returns early on an unknown file type)
    """
    start_time = time.time()

    # Accept "CSV"/"JSON" as advertised above, not just the lowercase spellings.
    normalized_file_type = str(file_type).strip().lower()

    # Both readers already return a dataframe, so no extra wrapping is needed.
    if normalized_file_type == "csv":
        # dtype=object keeps every column as read, so no type inference happens on load.
        dataframe = pd.read_csv(str(input_file_path), sep=",", encoding="ISO-8859-1", dtype=object)
    elif normalized_file_type == "json":
        dataframe = pd.read_json(str(input_file_path), orient='records', lines=True)
    else:
        print("Invalid file type entered - aborting operation")
        return

    number_examples, number_attributes = dataframe.shape
    print(f"\nThe number of rows (examples) in the dataframe is {number_examples}")
    print(f"The number of columns (attributes) in the dataframe is {number_attributes}\n")

    for attribute_name in attribute_name_list:
        null_examples = dataframe[attribute_name].isnull().sum()
        non_null_examples = number_examples - null_examples

        print(f"The number of NaN rows for \"{attribute_name}\" is {null_examples}")
        print(f"The number of non-NaN rows for \"{attribute_name}\" is {non_null_examples}\n")

    # Elapsed time is only emitted at DEBUG level.
    time_elapsed_seconds = time.time() - start_time
    time_elapsed_minutes = time_elapsed_seconds / 60.0
    time_elapsed_hours = time_elapsed_seconds / 60.0 / 60.0
    log.debug(f"The time taken to visualize the statistics is {time_elapsed_seconds} seconds, "
              f"{time_elapsed_minutes} minutes, {time_elapsed_hours} hours")


Call data analysis function to check for any Tweets that aren't associated with a company at all.<br>



In [14]:
    # Count Tweets whose "company_derived" attribute is missing, i.e. non-Company associated Tweets.
    count_nan_non_nan(
        input_file_path="D:/Dropbox/summer-research-2019/jupyter-notebooks/attribute-datasets/twitter-dataset-7-10-19-with-irrelevant-tweets-excluded.csv",
        attribute_name_list=["company_derived"],
        file_type="csv")


The number of rows (examples) in the dataframe is 654618
The number of columns (attributes) in the dataframe is 37

The number of NaN rows for "company_derived" is 0
The number of non-NaN rows for "company_derived" is 654618




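The output above shows that every Tweet in the current dataset is associated with at least one company (0 NaN rows for "company_derived").  Note: a previously reported figure of 1,057 Tweets not associated with any company does not match this output and presumably predates the exclusion of irrelevant Tweets from the dataset.  If unassociated Tweets reappear, we may consider dropping them from our dataset for SLO topic extraction or stance/sentiment analysis via machine learning.<br>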



## Tweet Company Assignment Counts:


The # of Tweets associated with each company in our dataset.<br>



In [15]:
    total_tweets = tweet_dataframe.shape[0]

    # Look the designation column up once instead of once per company.
    designation = tweet_dataframe["company_derived_designation"]

    # One dataframe per designation; the individual names are kept so that any other cell
    # relying on them keeps working.
    adani = tweet_dataframe.loc[designation == "adani"]
    multiple = tweet_dataframe.loc[designation == "multiple"]
    bhp = tweet_dataframe.loc[designation == "bhp"]
    cuesta = tweet_dataframe.loc[designation == "cuesta"]
    fortescue = tweet_dataframe.loc[designation == "fortescue"]
    iluka = tweet_dataframe.loc[designation == "iluka"]
    newmont = tweet_dataframe.loc[designation == "newmont"]
    oilsearch = tweet_dataframe.loc[designation == "oilsearch"]
    riotinto = tweet_dataframe.loc[designation == "riotinto"]
    santos = tweet_dataframe.loc[designation == "santos"]
    whitehaven = tweet_dataframe.loc[designation == "whitehaven"]
    woodside = tweet_dataframe.loc[designation == "woodside"]

    print(f"The total number of Tweets in the dataset is: {total_tweets}")

    # Same lines, in the same order, as the former one-print-per-company listing.
    for company_name, company_tweets in (
            ("adani", adani),
            ("multiple", multiple),
            ("bhp", bhp),
            ("cuesta", cuesta),
            ("fortescue", fortescue),
            ("iluka", iluka),
            ("newmont", newmont),
            ("oilsearch", oilsearch),
            ("riotinto", riotinto),
            ("santos", santos),
            ("whitehaven", whitehaven),
            ("woodside", woodside)):
        print(f"The number of Tweets associated with {company_name} is {company_tweets.shape[0]}")

The total number of Tweets in the dataset is: 654618
The number of Tweets associated with adani is 424437
The number of Tweets associated with multiple is 8205
The number of Tweets associated with bhp is 63039
The number of Tweets associated with cuesta is 101
The number of Tweets associated with fortescue is 11646
The number of Tweets associated with iluka is 2961
The number of Tweets associated with newmont is 888
The number of Tweets associated with oilsearch is 1554
The number of Tweets associated with riotinto is 25371
The number of Tweets associated with santos is 86279
The number of Tweets associated with whitehaven is 14718
The number of Tweets associated with woodside is 15419



Most Tweets are associated with Adani.<br>

