Merge pull request #320 from CamDavidsonPilon/lots-of-docs-edits

lots of doc updates
CamDavidsonPilon · Jul 19, 2017 · 8995d59 · 8995d59
2 parents 466b247 + d7eba48
commit 8995d59
Show file tree

Hide file tree

Showing 6 changed files with 158 additions and 192 deletions.
diff --git a/docs/Examples.rst b/docs/Examples.rst
@@ -8,7 +8,7 @@ More Examples and Recipes
 This section goes through some examples and recipes to help you use *lifelines*. 
 
 
-Compare two populations statistically
+Statistically compare two populations
 ##############################################
 
 (though this applies just as well to Nelson-Aalen estimates). Often researchers want to compare
@@ -37,7 +37,7 @@ compares whether the "death" generation process of the two populations are equal
     
     from lifelines.statistics import logrank_test
 
-    results = logrank_test(T1, T2, event_observed_A=C1, event_observed_B=C2)
+    results = logrank_test(T1, T2, event_observed_A=E1, event_observed_B=E2)
     results.print_summary()
 
     """
@@ -147,13 +147,13 @@ time (months, days, ...)      observed deaths       censored
     import pandas as pd
     
     # your argument in the function call below will be different
-    df = pd.read_csv('file.csv', index_cols=[0], columns = ['observed deaths', 'censored'] )
+    df = pd.read_csv('file.csv', index_cols=[0], columns = ['observed deaths', 'censored'])
 
     from lifelines.utils import survival_events_from_table
 
-    T,C = survival_events_from_table(df, observed_deaths_col='observed deaths', censored_col='censored')
+    T, E = survival_events_from_table(df, observed_deaths_col='observed deaths', censored_col='censored')
     print T # np.array([0,0,0,0,0,0,0,1,2,2, ...])
-    print C # np.array([1,1,1,1,1,1,1,0,1,1, ...])
+    print E # np.array([1,1,1,1,1,1,1,0,1,1, ...])
 
 
 Alternatively, perhaps you are interested in viewing the survival table given some durations and censorship vectors.
@@ -163,7 +163,7 @@ Alternatively, perhaps you are interested in viewing the survival table given so
     
     from lifelines.utils import survival_table_from_events
 
-    table = survival_table_from_events(T, C)
+    table = survival_table_from_events(T, E)
     print table.head()
     
     """
@@ -192,7 +192,7 @@ When `.plot` is called, an `axis` object is returned which can be passed into fu
     ax = kmf.plot(ax=ax)
 
 
-If you have a pandas `DataFrame` with columns "group", "T", and "C", then something like the following would work:
+If you have a pandas `DataFrame` with columns "group", "T", and "E", then something like the following would work:
 
 .. code-block:: python
     
@@ -204,7 +204,7 @@ If you have a pandas `DataFrame` with columns "group", "T", and "C", then someth
     kmf = KaplanMeierFitter()
     for group in df['group'].unique():
         data = grouped_data.get_group(group)
-        kmf.fit(data["T"], data["C"], label=group)
+        kmf.fit(data["T"], data["E"], label=group)
         kmf.plot(ax=ax)
     
 
@@ -219,7 +219,7 @@ Standard
 .. code-block:: python
     
     kmf = KaplanMeierFitter()
-    kmf.fit(T,C,label="kmf.plot()")
+    kmf.fit(T, E, label="kmf.plot()")
     kmf.plot()
 
 .. image:: /images/normal_plot.png 
@@ -243,7 +243,7 @@ Show censorships
 
 .. code-block:: python
 
-    kmf.fit(T,C,label="kmf.plot(show_censors=True)")
+    kmf.fit(T, E, label="kmf.plot(show_censors=True)")
     kmf.plot(show_censors=True)
 
 .. image:: images/show_censors_plot.png 
@@ -358,8 +358,8 @@ Below is a way to get an example dataset from a relational database (this may va
 
     SELECT 
       id, 
-      DATEDIFF('dd', started_at, COALESCE(ended_at, CURRENT_DATE) ) AS "T", 
-      (ended_at IS NOT NULL) AS "C" 
+      DATEDIFF('dd', started_at, COALESCE(ended_at, CURRENT_DATE)) AS "T", 
+      (ended_at IS NOT NULL) AS "E" 
     FROM some_tables
 
 Explanation
@@ -369,16 +369,11 @@ Each row is an `id`, a duration, and a boolean indicating whether the event occu
 "True" if the event *did* occur, that is, `ended_at` is filled in (we observed the `ended_at`). Ex: 
 
 ==================   ============   ============
-id                   T                      C
+id                   T                      E
 ==================   ============   ============
 10                   40                 True
 11                   42                 False
 12                   42                 False 
 13                   36                 True
 14                   33                 True
 ==================   ============   ============
-
-
-
-
-
diff --git a/docs/Quickstart.rst b/docs/Quickstart.rst
@@ -52,9 +52,9 @@ Let's start by importing some data. We need the durations that individuals are o
 
     from lifelines import KaplanMeierFitter
     kmf = KaplanMeierFitter()
-    kmf.fit(T, event_observed=E)  # more succiently, kmf.fit(T, E)
+    kmf.fit(T, event_observed=E)  # or, more succinctly, kmf.fit(T, E)
 
-After calling the ``fit`` method, we have access to new properties like ``survival_function_`` and methods like ``plot()``. The latter is a wrapper around Pandas internal plotting library. 
+After calling the ``fit`` method, we have access to new properties like ``survival_function_`` and methods like ``plot()``. The latter is a wrapper around pandas' internal plotting library.
 
 .. code:: python
     
@@ -102,15 +102,15 @@ Often you'll have data that looks like:
 
 *start_time*, *end_time*
 
-Lifelines has some utility functions to transform this dataset into durations and censorships:
+Lifelines has some utility functions to transform this dataset into duration and censorship vectors:
 
 .. code:: python
     
     from lifelines.utils import datetimes_to_durations
 
     # start_times is a vector of datetime objects
     # end_times is a vector of (possibly missing) datetime objects. 
-    T, C = datetimes_to_durations(start_times, end_times, freq='h')
+    T, E = datetimes_to_durations(start_times, end_times, freq='h')
 
 
 Alternatively, perhaps you are interested in viewing the survival table given some durations and censorship vectors.
@@ -137,7 +137,7 @@ Alternatively, perhaps you are interested in viewing the survival table given so
 Survival Regression
 -------------------
 
-While the above ``KaplanMeierFitter`` and ``NelsonAalenFitter`` are useful, they only give us an "average" view of the population. Often we have specific data at the individual level, either continuous or categorical, that we would like to use. For this, we turn to **survival regression**, specifically ``AalenAdditiveFitter`` or ``CoxPHFitter``.
+While the above ``KaplanMeierFitter`` and ``NelsonAalenFitter`` are useful, they only give us an "average" view of the population. Often we have specific data at the individual level, either continuous or categorical, that we would like to use. For this, we turn to **survival regression**, specifically ``AalenAdditiveFitter`` and ``CoxPHFitter``.
 
 .. code:: python
     
@@ -147,29 +147,51 @@ While the above ``KaplanMeierFitter`` and ``NelsonAalenFitter`` are useful, they
     regression_dataset.head()
 
 
-
-The input of the ``fit`` method's API on ``AalenAdditiveFitter`` is different than above. All the data, including durations, censorships and covariates must be contained in **a Pandas DataFrame** (yes, it must be a DataFrame). The duration column and event occured column must be specified in the call to ``fit``. 
+The input of the ``fit`` method's API in a regression is different. All the data, including durations, censorships and covariates, must be contained in **a Pandas DataFrame** (yes, it must be a DataFrame). The duration column and event-occurred column must be specified in the call to ``fit``.
 
 .. code:: python
     
-    from lifelines import AalenAdditiveFitter, CoxPHFitter
+    from lifelines import CoxPHFitter
 
     # Using Cox Proportional Hazards model
-    cf = CoxPHFitter()
-    cf.fit(regression_dataset, 'T', event_col='E')
-    cf.print_summary()
+    cph = CoxPHFitter()
+    cph.fit(regression_dataset, 'T', event_col='E')
+    cph.print_summary()
+
+    """
+    n=200, number of events=189
+
+           coef  exp(coef)  se(coef)      z      p  lower 0.95  upper 0.95
+    var1 0.2213     1.2477    0.0743 2.9796 0.0029      0.0757      0.3669  **
+    var2 0.0509     1.0522    0.0829 0.6139 0.5393     -0.1116      0.2134
+    var3 0.2186     1.2443    0.0758 2.8836 0.0039      0.0700      0.3672  **
+    ---
+    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+    Concordance = 0.580
+    """
+
+    cph.plot()
+
+.. image:: http://i.imgur.com/ko1tzcCl.png
+
+
+If we instead focus on Aalen's Additive model:
+
+.. code:: python
 
     # Using Aalen's Additive model
+    from lifelines import AalenAdditiveFitter
     aaf = AalenAdditiveFitter(fit_intercept=False)
     aaf.fit(regression_dataset, 'T', event_col='E')
 
 
-After fitting, you'll have access to properties like ``cumulative_hazards_`` and methods like ``plot``, ``predict_cumulative_hazards``, and ``predict_survival_function``. The latter two methods require an additional argument of individual covariates:
+Like ``CoxPHFitter``, after fitting you'll have access to properties like ``cumulative_hazards_`` and methods like ``plot``, ``predict_cumulative_hazards``, and ``predict_survival_function``. The latter two methods require an additional argument of individual covariates:
 
 .. code:: python
     
-    x = regression_dataset[regression_dataset.columns - ['E', 'T']]
-    aaf.predict_survival_function(x.iloc[10:12]).plot()  # get the unique survival functions of the first two subjects 
+    X = regression_dataset.drop(['E', 'T'], axis=1)
+    aaf.predict_survival_function(X.iloc[10:12]).plot()  # get the unique survival functions of two subjects 
 
 .. image:: images/quickstart_predict_aaf.png